spaCy/spacy/gold/converters/iob2docs.py
Matthew Honnibal a5ebfb20f5 Serialize all attrs by default
Move converters under spacy.gold

Move things around

Fix naming

Fix name

Update converter to produce DocBin

Update converters

Make spacy convert output docbin

Fix import

Fix docbin

Fix import

Update converter

Remove jsonl converter

Add json2docs converter
2020-06-22 00:46:08 +02:00

52 lines
1.8 KiB
Python

from wasabi import Printer

from ...gold import iob_to_biluo, tags_to_entities
from ...tokens import Doc, Span
from ...util import minibatch
from ...vocab import Vocab
from .conll_ner2docs import n_sents_info
from .util import merge_sentences
def iob2docs(input_data, n_sents=10, no_print=False, *args, **kwargs):
    """
    Turn sentence-per-line IOB or IOB2 text into Doc objects ready for saving.

    Tokens on a line are separated by whitespace and the fields of a token
    are joined with '|', either as word|iob or as word|tag|iob. Examples:
    I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O
    I|O like|O London|B-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O
    I|PRP|O like|VBP|O London|NNP|I-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O
    I|PRP|O like|VBP|O London|NNP|B-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O

    input_data (str): The file contents, split on newlines before parsing.
    n_sents (int): When greater than 0, the parsed docs are handed to
        merge_sentences together with this value.
    no_print (bool): Forwarded to the wasabi Printer used for messages.
    *args, **kwargs: Accepted and ignored.
    RETURNS: The docs produced by read_iob, or the result of
        merge_sentences when n_sents is greater than 0.
    """
    printer = Printer(no_print=no_print)
    lines = input_data.split("\n")
    sentence_docs = read_iob(lines)
    # Nothing to merge: hand back the per-sentence docs unchanged.
    if not n_sents > 0:
        return sentence_docs
    n_sents_info(printer, n_sents)
    return merge_sentences(sentence_docs, n_sents)
def read_iob(raw_sents, vocab=None):
    """Parse sentence-per-line IOB/IOB2 text into one Doc per sentence.

    Every non-blank line is split on whitespace into tokens, and every token
    is split on "|" into either ``word|iob`` or ``word|tag|iob`` fields. All
    tokens on a line must use the same layout.

    raw_sents (iterable of str): The lines to parse. Blank and
        whitespace-only lines are skipped.
    vocab (Vocab): Optional vocab used for all created docs. If omitted, a
        single new Vocab is created the first time a doc has to be built.
    RETURNS (list): One Doc per non-blank line, with token tags and entity
        spans set. Lines without a tag column get the placeholder tag "-".
    RAISES (ValueError): If the tokens of a line do not consistently have
        two or three "|"-separated fields.
    """
    docs = []
    for line in raw_sents:
        if not line.strip():
            continue
        tokens = [t.split("|") for t in line.split()]
        n_fields = len(tokens[0])
        # Check every token, not just the first: a line mixing layouts would
        # otherwise fail later with an unrelated unpacking error.
        if n_fields not in (2, 3) or any(len(t) != n_fields for t in tokens):
            raise ValueError(
                "The sentence-per-line IOB/IOB2 file is not formatted "
                "correctly. Try checking whitespace and delimiters. "
                "See https://spacy.io/api/cli#convert"
            )
        if n_fields == 3:
            words, tags, iob = zip(*tokens)
        else:
            words, iob = zip(*tokens)
            tags = ["-"] * len(words)
        if vocab is None:
            # Created lazily and then reused, so all docs from one call share
            # a single vocab and empty input never allocates one.
            vocab = Vocab()
        doc = Doc(vocab, words=list(words))
        for i, tag in enumerate(tags):
            doc[i].tag_ = tag
        biluo = iob_to_biluo(list(iob))
        entities = tags_to_entities(biluo)
        # NOTE(review): tags_to_entities reports an inclusive end index, while
        # Span expects an exclusive one, hence the +1 - confirm against the
        # helper's docstring.
        doc.ents = [
            Span(doc, start=start, end=end + 1, label=label)
            for (label, start, end) in entities
        ]
        docs.append(doc)
    return docs