2017-10-10 06:03:26 +03:00
|
|
|
# coding: utf8
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
2019-08-29 13:04:01 +03:00
|
|
|
from wasabi import Printer
|
|
|
|
|
2017-10-10 06:03:26 +03:00
|
|
|
from ...gold import iob_to_biluo
|
2019-08-29 13:04:01 +03:00
|
|
|
from ...lang.xx import MultiLanguage
|
|
|
|
from ...tokens.doc import Doc
|
|
|
|
from ...util import load_model
|
2017-10-10 06:03:26 +03:00
|
|
|
|
|
|
|
|
2019-10-24 17:21:08 +03:00
|
|
|
def conll_ner2json(
    input_data, n_sents=10, seg_sents=False, model=None, no_print=False, **kwargs
):
    """
    Convert files in the CoNLL-2003 NER format and similar
    whitespace-separated columns into JSON format for use with train cli.
    The first column is the tokens, the final column is the IOB tags. If an
    additional second column is present, the second column is the tags.

    Sentences are separated by a blank line and documents can be separated
    using the line "-DOCSTART- -X- O O".

    Sample format:

    -DOCSTART- -X- O O

    I O
    like O
    London B-GPE
    and O
    New B-GPE
    York I-GPE
    City I-GPE
    . O

    Delimiters that already exist in the data are preserved; the matching
    automatic segmentation option is switched off with a warning.

    input_data (unicode): The token-per-line text to convert.
    n_sents (int): Number of sentences to group into one document when the
        data has no document delimiters. 0 (or any non-positive value)
        disables grouping.
    seg_sents (bool): Automatically segment sentences when the data has no
        blank-line sentence boundaries.
    model (unicode): Optional model name handed to segment_sents_and_docs
        for sentence segmentation.
    no_print (bool): Passed to the Printer to suppress messages.
    **kwargs: Accepted and ignored.
    RETURNS (list): One dict per document of the form
        {"id": int, "paragraphs": [{"sentences": [{"tokens": [...]}]}]},
        each token being {"orth": ..., "tag": ..., "ner": ...}.
    RAISES (ValueError): If a sentence yields fewer than two columns.
    """
    msg = Printer(no_print=no_print)
    doc_delimiter = "-DOCSTART- -X- O O"
    # A missing or negative sentence count means "no document grouping".
    # Without this a negative value would still be truthy below and be used
    # as a chunk size.
    if not n_sents or n_sents < 0:
        n_sents = 0
    # check for existing delimiters, which should be preserved
    if "\n\n" in input_data and seg_sents:
        msg.warn(
            "Sentence boundaries found, automatic sentence segmentation with "
            "`-s` disabled."
        )
        seg_sents = False
    if doc_delimiter in input_data and n_sents:
        msg.warn(
            "Document delimiters found, automatic document segmentation with "
            "`-n` disabled."
        )
        n_sents = 0
    # do document segmentation with existing sentences
    if "\n\n" in input_data and doc_delimiter not in input_data and n_sents:
        n_sents_info(msg, n_sents)
        input_data = segment_docs(input_data, n_sents, doc_delimiter)
    # do sentence segmentation with existing documents
    if "\n\n" not in input_data and doc_delimiter in input_data and seg_sents:
        input_data = segment_sents_and_docs(input_data, 0, "", model=model, msg=msg)
    # do both sentence segmentation and document segmentation according
    # to options
    if "\n\n" not in input_data and doc_delimiter not in input_data:
        if seg_sents:
            # Only announce document grouping when it is actually enabled.
            if n_sents:
                n_sents_info(msg, n_sents)
            input_data = segment_sents_and_docs(
                input_data, n_sents, doc_delimiter, model=model, msg=msg
            )
        elif n_sents:
            # sentence segmentation required for document segmentation
            msg.warn(
                "No sentence boundaries found to use with option `-n {}`. "
                "Use `-s` to automatically segment sentences or `-n 0` "
                "to disable.".format(n_sents)
            )
        # With neither option set nothing is segmented here; the warnings
        # below tell the user how to opt in.
    # provide warnings for problematic data
    if "\n\n" not in input_data:
        msg.warn(
            "No sentence boundaries found. Use `-s` to automatically segment "
            "sentences."
        )
    if doc_delimiter not in input_data:
        msg.warn(
            "No document delimiters found. Use `-n` to automatically group "
            "sentences into documents."
        )
    output_docs = []
    for doc in input_data.strip().split(doc_delimiter):
        doc = doc.strip()
        if not doc:
            continue
        output_doc = []
        for sent in doc.split("\n\n"):
            sent = sent.strip()
            if not sent:
                continue
            lines = [line.strip() for line in sent.split("\n") if line.strip()]
            # Transpose rows into columns; zip stops at the shortest row, so
            # extra trailing fields on longer rows are ignored.
            cols = list(zip(*[line.split() for line in lines]))
            if len(cols) < 2:
                raise ValueError(
                    "The token-per-line NER file is not formatted correctly. "
                    "Try checking whitespace and delimiters. See "
                    "https://spacy.io/api/cli#convert"
                )
            words = cols[0]
            iob_ents = cols[-1]
            # With only two columns there are no tags, so use a placeholder.
            tags = cols[1] if len(cols) > 2 else ["-"] * len(words)
            biluo_ents = iob_to_biluo(iob_ents)
            output_doc.append(
                {
                    "tokens": [
                        {"orth": w, "tag": tag, "ner": ent}
                        for (w, tag, ent) in zip(words, tags, biluo_ents)
                    ]
                }
            )
        output_docs.append(
            {"id": len(output_docs), "paragraphs": [{"sentences": output_doc}]}
        )
    return output_docs
|
2019-08-29 13:04:01 +03:00
|
|
|
|
|
|
|
|
|
|
|
def segment_sents_and_docs(doc, n_sents, doc_delimiter, model=None, msg=None):
    """
    Insert sentence boundaries (and optionally document delimiters) into
    token-per-line text that has none.

    The first whitespace-separated field of every non-blank line is used as
    the token text. Sentence starts are predicted by the parser of `model`
    if the model has one, otherwise by a MultiLanguage sentencizer. An empty
    line is inserted before each predicted sentence start, and if `n_sents`
    is set, `doc_delimiter` is inserted before every `n_sents`-th sentence.

    doc (unicode): Token-per-line text.
    n_sents (int): Sentences per document; a falsy value inserts no
        document delimiters.
    doc_delimiter (unicode): Line to insert at the start of each document.
    model (unicode): Optional model name to load for parser-based
        segmentation.
    msg (Printer): Printer for status messages; a silent one is used if
        omitted.
    RETURNS (unicode): The lines joined with newlines, including the
        inserted boundaries. Empty string if `doc` has no non-blank lines.
    """
    # Blank or whitespace-only lines carry no token and would make the
    # first-field lookup below fail, so drop them up front.
    lines = [line for line in doc.strip().split("\n") if line.strip()]
    if not lines:
        return ""
    if msg is None:
        # The default must be usable: fall back to a printer that stays quiet.
        msg = Printer(no_print=True)
    sentencizer = None
    if model:
        nlp = load_model(model)
        if "parser" in nlp.pipe_names:
            msg.info("Segmenting sentences with parser from model '{}'.".format(model))
            sentencizer = nlp.get_pipe("parser")
    if not sentencizer:
        msg.info(
            "Segmenting sentences with sentencizer. (Use `-b model` for "
            "improved parser-based sentence segmentation.)"
        )
        nlp = MultiLanguage()
        sentencizer = nlp.create_pipe("sentencizer")
    words = [line.split()[0] for line in lines]
    nlpdoc = Doc(nlp.vocab, words=words)
    sentencizer(nlpdoc)
    lines_with_segs = []
    sent_count = 0
    # One token was created per kept line, so lines and tokens pair up 1:1.
    for line, token in zip(lines, nlpdoc):
        if token.is_sent_start:
            if n_sents and sent_count % n_sents == 0:
                lines_with_segs.append(doc_delimiter)
            # An empty entry becomes a blank line once joined with "\n".
            lines_with_segs.append("")
            sent_count += 1
        lines_with_segs.append(line)
    return "\n".join(lines_with_segs)
|
|
|
|
|
|
|
|
|
|
|
|
def segment_docs(input_data, n_sents, doc_delimiter):
    """
    Group blank-line-separated sentences into documents of `n_sents`
    sentences by inserting `doc_delimiter` lines.

    Every document is emitted as a blank line, the delimiter on a line of
    its own, another blank line, and then its sentences separated by blank
    lines. Chunks that contain only whitespace are not counted as sentences.

    input_data (unicode): Text whose sentences are separated by "\\n\\n".
    n_sents (int): Number of sentences per document. A missing or
        non-positive value disables grouping.
    doc_delimiter (unicode): Line marking the start of a document.
    RETURNS (unicode): The text with document delimiters inserted, or
        `input_data` unchanged when grouping is disabled.
    """
    # A zero step makes range() raise and a negative step yields no chunks
    # at all (dropping the data), so treat both as "grouping disabled".
    if not n_sents or n_sents < 0:
        return input_data
    sent_delimiter = "\n\n"
    # Trailing delimiters or runs of blank lines produce empty chunks that
    # must not use up a sentence slot in a document.
    sents = [sent for sent in input_data.split(sent_delimiter) if sent.strip()]
    docs = [sents[i : i + n_sents] for i in range(0, len(sents), n_sents)]
    # The delimiter gets its own line instead of being fused to the first
    # token line of the document.
    return "".join(
        sent_delimiter + doc_delimiter + sent_delimiter + sent_delimiter.join(doc)
        for doc in docs
    )
|
|
|
|
|
|
|
|
|
|
|
|
def n_sents_info(msg, n_sents):
    """
    Report the document grouping size through `msg`.

    Always emits an info message with the number of sentences per document,
    and additionally warns when every sentence would become its own
    document.

    msg: Object with `info` and `warn` methods used for output.
    n_sents (int): Number of sentences grouped into one document.
    """
    grouping_note = "Grouping every {} sentences into a document.".format(n_sents)
    msg.info(grouping_note)
    if n_sents != 1:
        return
    # Single-sentence documents get a hint towards a larger group size.
    msg.warn(
        "To generate better training data, you may want to group "
        "sentences into documents with `-n 10`."
    )
|