mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-11 09:00:36 +03:00
Remove jsonl converter
This commit is contained in:
parent
5d89b1840e
commit
f1756a6a22
|
@ -6,7 +6,6 @@ import sys
|
|||
|
||||
from ..tokens import DocBin
|
||||
from ..gold.converters import iob2docs, conll_ner2docs, json2docs
|
||||
from ..gold.converters import ner_jsonl2docs
|
||||
|
||||
|
||||
# Converters are matched by file extension except for ner/iob, which are
|
||||
|
@ -20,7 +19,6 @@ CONVERTERS = {
|
|||
#"conll": conllu2docs, TODO
|
||||
"ner": conll_ner2docs,
|
||||
"iob": iob2docs,
|
||||
"jsonl": ner_jsonl2docs,
|
||||
"json": json2docs,
|
||||
}
|
||||
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
from .iob2docs import iob2docs # noqa: F401
|
||||
from .conll_ner2docs import conll_ner2docs # noqa: F401
|
||||
from .jsonl2docs import ner_jsonl2docs # noqa: F401
|
||||
from .json2docs import json2docs
|
||||
|
||||
# TODO: Update this one
|
||||
|
|
|
@ -1,51 +0,0 @@
|
|||
import srsly
|
||||
|
||||
from ...gold import docs_to_json
|
||||
from ...util import get_lang_class, minibatch
|
||||
|
||||
|
||||
def ner_jsonl2docs(input_data, lang=None, n_sents=10, use_morphology=False, **_):
|
||||
if lang is None:
|
||||
raise ValueError("No --lang specified, but tokenization required")
|
||||
docs = []
|
||||
input_examples = [srsly.json_loads(line) for line in input_data.strip().split("\n")]
|
||||
nlp = get_lang_class(lang)()
|
||||
sentencizer = nlp.create_pipe("sentencizer")
|
||||
for i, batch in enumerate(minibatch(input_examples, size=n_sents)):
|
||||
docs = []
|
||||
# TODO: Should we be merging these? We're disrespecting the n_sents
|
||||
# currently.
|
||||
for record in batch:
|
||||
raw_text = record["text"]
|
||||
if "entities" in record:
|
||||
ents = record["entities"]
|
||||
else:
|
||||
ents = record["spans"]
|
||||
ents = [(e["start"], e["end"], e["label"]) for e in ents]
|
||||
doc = nlp.make_doc(raw_text)
|
||||
sentencizer(doc)
|
||||
spans = [doc.char_span(s, e, label=L) for s, e, L in ents]
|
||||
doc.ents = _cleanup_spans(spans)
|
||||
docs.append(doc)
|
||||
return docs
|
||||
|
||||
|
||||
def _cleanup_spans(spans):
|
||||
output = []
|
||||
seen = set()
|
||||
for span in spans:
|
||||
if span is not None:
|
||||
# Trim whitespace
|
||||
while len(span) and span[0].is_space:
|
||||
span = span[1:]
|
||||
while len(span) and span[-1].is_space:
|
||||
span = span[:-1]
|
||||
if not len(span):
|
||||
continue
|
||||
for i in range(span.start, span.end):
|
||||
if i in seen:
|
||||
break
|
||||
else:
|
||||
output.append(span)
|
||||
seen.update(range(span.start, span.end))
|
||||
return output
|
Loading…
Reference in New Issue
Block a user