Start updating converters

Matthew Honnibal 2020-06-20 03:19:40 +02:00
parent 6d5bfd6f6a
commit d422f30a18
4 changed files with 51 additions and 61 deletions

spacy/cli/converters/__init__.py

@@ -1,4 +1,4 @@
 from .conllu2json import conllu2json  # noqa: F401
-from .iob2json import iob2json  # noqa: F401
-from .conll_ner2json import conll_ner2json  # noqa: F401
-from .jsonl2json import ner_jsonl2json  # noqa: F401
+from .iob2json import iob2docs  # noqa: F401
+from .conll_ner2json import conll_ner2doc  # noqa: F401
+from .jsonl2docs import ner_jsonl2docs  # noqa: F401

spacy/cli/converters/conll_ner2json.py

@@ -3,15 +3,16 @@ from wasabi import Printer
-from ...gold import iob_to_biluo
+from ...gold import iob_to_biluo, tags_to_entities
 from ...lang.xx import MultiLanguage
 from ...tokens.doc import Doc
+from ...tokens.span import Span
+from ...vocab import Vocab
 from ...util import load_model
 
 
-def conll_ner2json(
+def conll_ner2doc(
     input_data, n_sents=10, seg_sents=False, model=None, no_print=False, **kwargs
 ):
     """
     Convert files in the CoNLL-2003 NER format and similar
-    whitespace-separated columns into JSON format for use with train cli.
+    whitespace-separated columns into Doc objects.
 
     The first column is the tokens, the final column is the IOB tags. If an
     additional second column is present, the second column is the tags.
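
For reference, input in this format looks roughly like the following (a hypothetical sample; the `-DOCSTART-` line is the document delimiter the converter splits on, the first column is the tokens, the middle columns are optional tags, and the last column is the IOB entity tags):

    -DOCSTART- -X- O O

    EU NNP I-NP B-ORG
    rejects VBZ I-VP O
    German JJ I-NP B-MISC
    call NN I-NP O
    . . O O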
@@ -81,17 +82,25 @@ def conll_ner2json(
             "No document delimiters found. Use `-n` to automatically group "
             "sentences into documents."
         )
+    if model:
+        nlp = load_model(model)
+    else:
+        nlp = MultiLanguage()
     output_docs = []
-    for doc in input_data.strip().split(doc_delimiter):
-        doc = doc.strip()
-        if not doc:
+    for conll_doc in input_data.strip().split(doc_delimiter):
+        conll_doc = conll_doc.strip()
+        if not conll_doc:
             continue
-        output_doc = []
-        for sent in doc.split("\n\n"):
-            sent = sent.strip()
-            if not sent:
+        words = []
+        sent_starts = []
+        pos_tags = []
+        biluo_tags = []
+        for conll_sent in conll_doc.split("\n\n"):
+            conll_sent = conll_sent.strip()
+            if not conll_sent:
                 continue
-            lines = [line.strip() for line in sent.split("\n") if line.strip()]
+            lines = [line.strip() for line in conll_sent.split("\n") if line.strip()]
             cols = list(zip(*[line.split() for line in lines]))
             if len(cols) < 2:
                 raise ValueError(
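
The `zip(*...)` line transposes the whitespace-separated rows into columns, so the code below can pick out tokens and tags column-wise; a quick sketch of the idiom:

    lines = ["EU NNP B-ORG", "rejects VBZ O"]
    cols = list(zip(*[line.split() for line in lines]))
    # cols[0] == ("EU", "rejects")    -> token column
    # cols[1] == ("NNP", "VBZ")       -> optional tag column
    # cols[-1] == ("B-ORG", "O")      -> IOB entity column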
@@ -99,25 +108,19 @@ def conll_ner2json(
                     "Try checking whitespace and delimiters. See "
                     "https://spacy.io/api/cli#convert"
                 )
-            words = cols[0]
-            iob_ents = cols[-1]
-            if len(cols) > 2:
-                tags = cols[1]
-            else:
-                tags = ["-"] * len(words)
-            biluo_ents = iob_to_biluo(iob_ents)
-            output_doc.append(
-                {
-                    "tokens": [
-                        {"orth": w, "tag": tag, "ner": ent}
-                        for (w, tag, ent) in zip(words, tags, biluo_ents)
-                    ]
-                }
-            )
-        output_docs.append(
-            {"id": len(output_docs), "paragraphs": [{"sentences": output_doc}]}
-        )
-        output_doc = []
+            length = len(cols[0])
+            words.extend(cols[0])
+            sent_starts.extend([True] + [False] * (length - 1))
+            biluo_tags.extend(iob_to_biluo(cols[-1]))
+            pos_tags.extend(cols[1] if len(cols) > 2 else ["-"] * length)
+        doc = Doc(nlp.vocab, words=words)
+        for i, token in enumerate(doc):
+            token.tag_ = pos_tags[i]
+            token.is_sent_start = sent_starts[i]
+        entities = tags_to_entities(biluo_tags)
+        doc.ents = [Span(doc, start=s, end=e + 1, label=L) for L, s, e in entities]
+        output_docs.append(doc)
     return output_docs
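
The `e + 1` offset is needed because `tags_to_entities` (in spacy.gold in v2.x) returns (label, start, end) token indices with an inclusive end, while `Span` takes an exclusive end; a quick sketch:

    from spacy.gold import tags_to_entities

    # BILUO tags for "EU rejects German call"
    tags = ["U-ORG", "O", "U-MISC", "O"]
    print(tags_to_entities(tags))
    # [('ORG', 0, 0), ('MISC', 2, 2)] -- inclusive ends, hence end=e + 1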

spacy/cli/converters/iob2json.py

@@ -1,14 +1,15 @@
 from wasabi import Printer
 
-from ...gold import iob_to_biluo
+from ...gold import iob_to_biluo, tags_to_entities
+from ...tokens.doc import Doc
+from ...tokens.span import Span
 from ...util import minibatch
+from ...vocab import Vocab
+from .util import merge_sentences
 from .conll_ner2json import n_sents_info
 
 
-def iob2json(input_data, n_sents=10, no_print=False, *args, **kwargs):
+def iob2docs(input_data, n_sents=10, no_print=False, *args, **kwargs):
     """
     Convert IOB files with one sentence per line and tags separated with '|'
-    into JSON format for use with train cli. IOB and IOB2 are accepted.
+    into Doc objects so they can be saved. IOB and IOB2 are accepted.
 
     Sample formats:
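
A sentence line in this format looks something like the following (hypothetical samples, with and without the optional tag column):

    I|PRP|O like|VBP|O London|NNP|B-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O
    I|O like|O London|B-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O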
@@ -26,40 +27,25 @@ def iob2json(input_data, n_sents=10, no_print=False, *args, **kwargs):
 def read_iob(raw_sents):
-    sentences = []
+    docs = []
+    vocab = Vocab()  # no vocab is passed in yet, so use a blank one
     for line in raw_sents:
         if not line.strip():
             continue
         tokens = [t.split("|") for t in line.split()]
         if len(tokens[0]) == 3:
-            words, pos, iob = zip(*tokens)
+            words, tags, iob = zip(*tokens)
         elif len(tokens[0]) == 2:
             words, iob = zip(*tokens)
-            pos = ["-"] * len(words)
+            tags = ["-"] * len(words)
         else:
             raise ValueError(
                 "The sentence-per-line IOB/IOB2 file is not formatted correctly. Try checking whitespace and delimiters. See https://spacy.io/api/cli#convert"
             )
+        doc = Doc(vocab, words=words)
+        for i, tag in enumerate(tags):
+            doc[i].tag_ = tag
         biluo = iob_to_biluo(iob)
-        sentences.append(
-            [
-                {"orth": w, "tag": p, "ner": ent}
-                for (w, p, ent) in zip(words, pos, biluo)
-            ]
-        )
-    sentences = [{"tokens": sent} for sent in sentences]
-    paragraphs = [{"sentences": [sent]} for sent in sentences]
-    docs = [{"id": i, "paragraphs": [para]} for i, para in enumerate(paragraphs)]
+        entities = tags_to_entities(biluo)
+        doc.ents = [Span(doc, start=s, end=e + 1, label=L) for (L, s, e) in entities]
+        docs.append(doc)
     return docs
-
-
-def merge_sentences(docs, n_sents):
-    merged = []
-    for group in minibatch(docs, size=n_sents):
-        group = list(group)
-        first = group.pop(0)
-        to_extend = first["paragraphs"][0]["sentences"]
-        for sent in group:
-            to_extend.extend(sent["paragraphs"][0]["sentences"])
-        merged.append(first)
-    return merged
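
Since the converter now returns Doc objects rather than JSON dicts, one plausible way to persist its output is spaCy's DocBin (a sketch, assuming `iob2docs` above is importable; the filename is illustrative):

    from spacy.tokens import DocBin

    docs = iob2docs("I|O like|O London|B-GPE and|O New|B-GPE York|I-GPE")
    doc_bin = DocBin(attrs=["TAG", "ENT_IOB", "ENT_TYPE"])
    for doc in docs:
        doc_bin.add(doc)
    with open("train.spacy", "wb") as file_:
        file_.write(doc_bin.to_bytes())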

spacy/cli/converters/jsonl2json.py → spacy/cli/converters/jsonl2docs.py

@@ -4,15 +4,17 @@ from ...gold import docs_to_json
 from ...util import get_lang_class, minibatch
 
 
-def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False, **_):
+def ner_jsonl2docs(input_data, lang=None, n_sents=10, use_morphology=False, **_):
     if lang is None:
         raise ValueError("No --lang specified, but tokenization required")
-    json_docs = []
+    docs = []
     input_examples = [srsly.json_loads(line) for line in input_data.strip().split("\n")]
     nlp = get_lang_class(lang)()
     sentencizer = nlp.create_pipe("sentencizer")
     for i, batch in enumerate(minibatch(input_examples, size=n_sents)):
-        docs = []
+        # TODO: Should we be merging these? We're disrespecting the n_sents
+        # currently.
         for record in batch:
             raw_text = record["text"]
             if "entities" in record:
@@ -25,8 +27,7 @@ def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False, **_)
             spans = [doc.char_span(s, e, label=L) for s, e, L in ents]
             doc.ents = _cleanup_spans(spans)
             docs.append(doc)
-        json_docs.append(docs_to_json(docs, id=i))
-    return json_docs
+    return docs
 
 
 def _cleanup_spans(spans):
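
Each input line for this converter is a JSON record with a "text" key and, per the code above, optional "entities" given as (start, end, label) character offsets; a hypothetical record:

    {"text": "Flights to London and Paris.", "entities": [[11, 17, "GPE"], [22, 27, "GPE"]]}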