Start updating converters

Matthew Honnibal 2020-06-20 03:19:40 +02:00
parent 6d5bfd6f6a
commit d422f30a18
4 changed files with 51 additions and 61 deletions

spacy/cli/converters/__init__.py

@@ -1,4 +1,4 @@
 from .conllu2json import conllu2json  # noqa: F401
-from .iob2json import iob2json  # noqa: F401
+from .iob2json import iob2docs  # noqa: F401
-from .conll_ner2json import conll_ner2json  # noqa: F401
+from .conll_ner2json import conll_ner2doc  # noqa: F401
-from .jsonl2json import ner_jsonl2json  # noqa: F401
+from .jsonl2docs import ner_jsonl2docs  # noqa: F401

spacy/cli/converters/conll_ner2json.py

@@ -3,15 +3,16 @@ from wasabi import Printer
-from ...gold import iob_to_biluo
+from ...gold import iob_to_biluo, tags_to_entities
 from ...lang.xx import MultiLanguage
-from ...tokens.doc import Doc
+from ...tokens import Doc, Span
+from ...vocab import Vocab
 from ...util import load_model


-def conll_ner2json(
+def conll_ner2doc(
     input_data, n_sents=10, seg_sents=False, model=None, no_print=False, **kwargs
 ):
     """
     Convert files in the CoNLL-2003 NER format and similar
-    whitespace-separated columns into JSON format for use with train cli.
+    whitespace-separated columns into Doc objects.

     The first column is the tokens, the final column is the IOB tags. If an
     additional second column is present, it is interpreted as the POS tags.
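
For reference, the input this converter expects looks like the following CoNLL-style sample (tokens in the first column, IOB entity tags in the last, optional POS tags in between). The sample is illustrative, not part of the commit:

    -DOCSTART- -X- O

    EU NNP B-ORG
    rejects VBZ O
    German JJ B-MISC
    call NN O
    . . O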
@@ -81,17 +82,25 @@ def conll_ner2json(
             "No document delimiters found. Use `-n` to automatically group "
             "sentences into documents."
         )
     if model:
         nlp = load_model(model)
     else:
         nlp = MultiLanguage()
     output_docs = []
-    for doc in input_data.strip().split(doc_delimiter):
-        doc = doc.strip()
-        if not doc:
+    for conll_doc in input_data.strip().split(doc_delimiter):
+        conll_doc = conll_doc.strip()
+        if not conll_doc:
             continue
-        output_doc = []
-        for sent in doc.split("\n\n"):
-            sent = sent.strip()
-            if not sent:
+        words = []
+        sent_starts = []
+        pos_tags = []
+        biluo_tags = []
+        for conll_sent in conll_doc.split("\n\n"):
+            conll_sent = conll_sent.strip()
+            if not conll_sent:
                 continue
-            lines = [line.strip() for line in sent.split("\n") if line.strip()]
+            lines = [line.strip() for line in conll_sent.split("\n") if line.strip()]
             cols = list(zip(*[line.split() for line in lines]))
             if len(cols) < 2:
                 raise ValueError(
@@ -99,25 +108,19 @@ def conll_ner2json(
                     "Try checking whitespace and delimiters. See "
                     "https://spacy.io/api/cli#convert"
                 )
-            words = cols[0]
-            iob_ents = cols[-1]
-            if len(cols) > 2:
-                tags = cols[1]
-            else:
-                tags = ["-"] * len(words)
-            biluo_ents = iob_to_biluo(iob_ents)
-            output_doc.append(
-                {
-                    "tokens": [
-                        {"orth": w, "tag": tag, "ner": ent}
-                        for (w, tag, ent) in zip(words, tags, biluo_ents)
-                    ]
-                }
-            )
-        output_docs.append(
-            {"id": len(output_docs), "paragraphs": [{"sentences": output_doc}]}
-        )
-        output_doc = []
+            length = len(cols[0])
+            words.extend(cols[0])
+            sent_starts.extend([True] + [False] * (length - 1))
+            biluo_tags.extend(iob_to_biluo(cols[-1]))
+            pos_tags.extend(cols[1] if len(cols) > 2 else ["-"] * length)
+        doc = Doc(nlp.vocab, words=words)
+        for i, token in enumerate(doc):
+            token.tag_ = pos_tags[i]
+            token.is_sent_start = sent_starts[i]
+        entities = tags_to_entities(biluo_tags)
+        doc.ents = [Span(doc, start=s, end=e + 1, label=L) for L, s, e in entities]
+        output_docs.append(doc)
     return output_docs
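
Because the converter now returns Doc objects instead of JSON dicts, the natural way to save its output is spaCy's DocBin. A minimal consumption sketch, assuming the converter is exposed as in the updated __init__.py; the file names and attribute list here are illustrative, not part of the commit:

    from spacy.tokens import DocBin

    from spacy.cli.converters import conll_ner2doc

    with open("train.conll", encoding="utf8") as file_:
        input_data = file_.read()

    # Convert to Docs, then pack them into a DocBin for binary serialization
    docs = conll_ner2doc(input_data, n_sents=10)
    doc_bin = DocBin(attrs=["TAG", "SENT_START", "ENT_IOB", "ENT_TYPE"])
    for doc in docs:
        doc_bin.add(doc)
    with open("train.spacy", "wb") as out_file:
        out_file.write(doc_bin.to_bytes())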

spacy/cli/converters/iob2json.py

@@ -1,14 +1,15 @@
 from wasabi import Printer

-from ...gold import iob_to_biluo
+from ...gold import iob_to_biluo, tags_to_entities
+from ...tokens import Doc, Span
+from ...vocab import Vocab
 from ...util import minibatch
+from .util import merge_sentences
 from .conll_ner2json import n_sents_info


-def iob2json(input_data, n_sents=10, no_print=False, *args, **kwargs):
+def iob2docs(input_data, n_sents=10, no_print=False, *args, **kwargs):
     """
     Convert IOB files with one sentence per line and tags separated with '|'
-    into JSON format for use with train cli. IOB and IOB2 are accepted.
+    into Doc objects so they can be saved. IOB and IOB2 are accepted.

     Sample formats:
@@ -26,40 +27,25 @@ def iob2json(input_data, n_sents=10, no_print=False, *args, **kwargs):
 def read_iob(raw_sents):
-    sentences = []
+    vocab = Vocab()  # blank vocab shared by the converted docs
+    docs = []
     for line in raw_sents:
         if not line.strip():
             continue
         tokens = [t.split("|") for t in line.split()]
         if len(tokens[0]) == 3:
-            words, pos, iob = zip(*tokens)
+            words, tags, iob = zip(*tokens)
         elif len(tokens[0]) == 2:
             words, iob = zip(*tokens)
-            pos = ["-"] * len(words)
+            tags = ["-"] * len(words)
         else:
             raise ValueError(
                 "The sentence-per-line IOB/IOB2 file is not formatted correctly. Try checking whitespace and delimiters. See https://spacy.io/api/cli#convert"
             )
+        doc = Doc(vocab, words=words)
+        for i, tag in enumerate(tags):
+            doc[i].tag_ = tag
         biluo = iob_to_biluo(iob)
-        sentences.append(
-            [
-                {"orth": w, "tag": p, "ner": ent}
-                for (w, p, ent) in zip(words, pos, biluo)
-            ]
-        )
-    sentences = [{"tokens": sent} for sent in sentences]
-    paragraphs = [{"sentences": [sent]} for sent in sentences]
-    docs = [{"id": i, "paragraphs": [para]} for i, para in enumerate(paragraphs)]
+        entities = tags_to_entities(biluo)
+        doc.ents = [Span(doc, start=s, end=e + 1, label=L) for (L, s, e) in entities]
+        docs.append(doc)
     return docs


-def merge_sentences(docs, n_sents):
-    merged = []
-    for group in minibatch(docs, size=n_sents):
-        group = list(group)
-        first = group.pop(0)
-        to_extend = first["paragraphs"][0]["sentences"]
-        for sent in group:
-            to_extend.extend(sent["paragraphs"][0]["sentences"])
-        merged.append(first)
-    return merged
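
To make the pipe-separated format concrete, here is a tiny standalone sketch of the parsing step above; the sample line is made up but follows the three-field word|tag|IOB layout the docstring describes:

    # Each whitespace-separated token is split on "|" into word, tag, IOB
    line = "I|PRP|O like|VBP|O London|NNP|B-GPE .|.|O"
    tokens = [t.split("|") for t in line.split()]
    words, tags, iob = zip(*tokens)
    assert words == ("I", "like", "London", ".")
    assert tags == ("PRP", "VBP", "NNP", ".")
    assert iob == ("O", "O", "B-GPE", "O")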

spacy/cli/converters/jsonl2json.py → spacy/cli/converters/jsonl2docs.py

@@ -4,15 +4,17 @@ from ...gold import docs_to_json
 from ...util import get_lang_class, minibatch


-def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False, **_):
+def ner_jsonl2docs(input_data, lang=None, n_sents=10, use_morphology=False, **_):
     if lang is None:
         raise ValueError("No --lang specified, but tokenization required")
-    json_docs = []
+    docs = []
     input_examples = [srsly.json_loads(line) for line in input_data.strip().split("\n")]
     nlp = get_lang_class(lang)()
     sentencizer = nlp.create_pipe("sentencizer")
     for i, batch in enumerate(minibatch(input_examples, size=n_sents)):
-        docs = []
+        # TODO: Should we be merging these? We're currently not respecting n_sents.
         for record in batch:
             raw_text = record["text"]
             if "entities" in record:
@@ -25,8 +27,7 @@ def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False, **_)
             spans = [doc.char_span(s, e, label=L) for s, e, L in ents]
             doc.ents = _cleanup_spans(spans)
             docs.append(doc)
-        json_docs.append(docs_to_json(docs, id=i))
-    return json_docs
+    return docs


 def _cleanup_spans(spans):
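
For context, this converter reads one JSON object per line, with a "text" field and optional character-offset "entities" triples of (start, end, label). A hypothetical record, parsed the same way the converter does:

    import srsly

    # One input line; entity offsets are character indices into the raw text
    line = '{"text": "I like London and Berlin.", "entities": [[7, 13, "GPE"], [18, 24, "GPE"]]}'
    record = srsly.json_loads(line)
    assert record["text"][7:13] == "London"
    assert record["text"][18:24] == "Berlin"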