mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-06 22:50:33 +03:00
a5ebfb20f5
Move converters under spacy.gold Move things around Fix naming Fix name Update converter to produce DocBin Update converters Make spacy convert output docbin Fix import Fix docbin Fix import Update converter Remove jsonl converter Add json2docs converter
52 lines
1.8 KiB
Python
52 lines
1.8 KiB
Python
from wasabi import Printer

from ...gold import iob_to_biluo, tags_to_entities
from ...tokens import Doc, Span
from ...util import minibatch
from ...vocab import Vocab
from .conll_ner2docs import n_sents_info
from .util import merge_sentences
|
|
|
|
|
|
def iob2docs(input_data, n_sents=10, no_print=False, *args, **kwargs):
    """
    Turn sentence-per-line IOB text into Doc objects ready to be saved.

    Every line of the input holds one sentence; tokens are separated by
    whitespace and the fields of a token by '|'. Both IOB and IOB2 tagging
    are accepted, with or without a middle tag column.

    Sample formats:

    I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O
    I|O like|O London|B-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O
    I|PRP|O like|VBP|O London|NNP|I-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O
    I|PRP|O like|VBP|O London|NNP|B-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O

    input_data: The full text of the IOB file; it is split on newlines.
    n_sents: When greater than zero, the per-sentence docs are handed to
        merge_sentences together with this value. Otherwise the
        per-sentence docs are returned unchanged.
    no_print: Forwarded to the wasabi Printer.
    *args, **kwargs: Accepted and ignored.
    RETURNS: The docs built by read_iob, optionally passed through
        merge_sentences.
    """
    printer = Printer(no_print=no_print)
    sentence_docs = read_iob(input_data.split("\n"))
    # Guard clause: nothing to merge unless a positive n_sents was requested.
    if not n_sents > 0:
        return sentence_docs
    n_sents_info(printer, n_sents)
    return merge_sentences(sentence_docs, n_sents)
|
|
|
|
|
|
def read_iob(raw_sents, vocab=None):
    """
    Parse sentence-per-line IOB/IOB2 text into one Doc per non-blank line.

    Each line is split on whitespace into tokens, and each token on '|' into
    either two fields (word|iob) or three fields (word|tag|iob). When no tag
    column is present every token gets the placeholder tag "-".

    raw_sents: Iterable of lines (str). Blank lines are skipped.
    vocab: Optional vocab to create the docs with. If None, a single new
        Vocab is created on first use and shared by all docs of this call.
    RETURNS (list): The created Doc objects, in input order, with token tags
        and entity spans set.
    RAISES (ValueError): If the tokens of a line do not all have the same
        number of fields, or that number is neither 2 nor 3.
    """
    docs = []
    for line in raw_sents:
        if not line.strip():
            continue
        tokens = [t.split("|") for t in line.split()]
        n_fields = len(tokens[0])
        # Validate every token, not just the first one: with mixed field
        # counts zip(*tokens) would silently truncate or fail to unpack.
        if n_fields not in (2, 3) or any(len(t) != n_fields for t in tokens):
            raise ValueError(
                "The sentence-per-line IOB/IOB2 file is not formatted correctly. Try checking whitespace and delimiters. See https://spacy.io/api/cli#convert"
            )
        if n_fields == 3:
            words, tags, iob = zip(*tokens)
        else:
            words, iob = zip(*tokens)
            tags = ["-"] * len(words)
        if vocab is None:
            # Created lazily and reused so all docs share the same vocab.
            vocab = Vocab()
        doc = Doc(vocab, words=words)
        for i, tag in enumerate(tags):
            doc[i].tag_ = tag
        biluo = iob_to_biluo(iob)
        entities = tags_to_entities(biluo)
        # tags_to_entities reports an inclusive end index, whereas Span takes
        # an exclusive end, hence the + 1.
        doc.ents = [
            Span(doc, start=s, end=e + 1, label=L) for (L, s, e) in entities
        ]
        docs.append(doc)
    return docs
|