Mirror of https://github.com/explosion/spaCy.git, synced 2025-02-11 09:00:36 +03:00
Start updating converters

commit d422f30a18 (parent 6d5bfd6f6a)
@@ -1,4 +1,4 @@
 from .conllu2json import conllu2json  # noqa: F401
-from .iob2json import iob2json  # noqa: F401
+from .iob2json import iob2docs  # noqa: F401
 from .conll_ner2json import conll_ner2json  # noqa: F401
-from .jsonl2json import ner_jsonl2json  # noqa: F401
+from .jsonl2docs import ner_jsonl2json  # noqa: F401
@@ -3,15 +3,16 @@ from wasabi import Printer
 from ...gold import iob_to_biluo
 from ...lang.xx import MultiLanguage
 from ...tokens.doc import Doc
+from ...vocab import Vocab
 from ...util import load_model
 
 
-def conll_ner2json(
+def conll_ner2doc(
     input_data, n_sents=10, seg_sents=False, model=None, no_print=False, **kwargs
 ):
     """
     Convert files in the CoNLL-2003 NER format and similar
-    whitespace-separated columns into JSON format for use with train cli.
+    whitespace-separated columns into Doc objects.
 
     The first column is the tokens, the final column is the IOB tags. If an
     additional second column is present, the second column is the tags.
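
For reference, the CoNLL-style input the docstring describes and the iob_to_biluo helper the converter relies on look roughly like this. A minimal sketch, not part of this patch; the sample sentence is invented:

    from spacy.gold import iob_to_biluo  # same helper the converter imports

    # Two whitespace-separated columns per line: token and IOB tag.
    # Three-column files carry a POS tag in the middle column.
    conll_sample = """Alex B-PER
    visited O
    Berlin B-GPE
    . O"""

    iob_tags = [line.split()[-1] for line in conll_sample.splitlines()]
    print(iob_to_biluo(iob_tags))  # ['U-PER', 'O', 'U-GPE', 'O']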
@@ -81,17 +82,25 @@ def conll_ner2json(
             "No document delimiters found. Use `-n` to automatically group "
             "sentences into documents."
         )
+
+    if model:
+        nlp = load_model(model)
+    else:
+        nlp = MultiLanguage()
     output_docs = []
-    for doc in input_data.strip().split(doc_delimiter):
-        doc = doc.strip()
-        if not doc:
+    for conll_doc in input_data.strip().split(doc_delimiter):
+        conll_doc = conll_doc.strip()
+        if not conll_doc:
             continue
-        output_doc = []
-        for sent in doc.split("\n\n"):
-            sent = sent.strip()
+        words = []
+        sent_starts = []
+        pos_tags = []
+        biluo_tags = []
+        for conll_sent in conll_doc.split("\n\n"):
+            conll_sent = conll_sent.strip()
             if not sent:
                 continue
-            lines = [line.strip() for line in sent.split("\n") if line.strip()]
+            lines = [line.strip() for line in conll_sent.split("\n") if line.strip()]
             cols = list(zip(*[line.split() for line in lines]))
             if len(cols) < 2:
                 raise ValueError(
@@ -99,25 +108,19 @@ def conll_ner2json(
                     "Try checking whitespace and delimiters. See "
                     "https://spacy.io/api/cli#convert"
                 )
-            words = cols[0]
-            iob_ents = cols[-1]
-            if len(cols) > 2:
-                tags = cols[1]
-            else:
-                tags = ["-"] * len(words)
-            biluo_ents = iob_to_biluo(iob_ents)
-            output_doc.append(
-                {
-                    "tokens": [
-                        {"orth": w, "tag": tag, "ner": ent}
-                        for (w, tag, ent) in zip(words, tags, biluo_ents)
-                    ]
-                }
-            )
-        output_docs.append(
-            {"id": len(output_docs), "paragraphs": [{"sentences": output_doc}]}
-        )
-        output_doc = []
+            length = len(cols[0])
+            words.extend(cols[0])
+            sent_stats.extend([True] + [False] * (length - 1))
+            biluo_tags.extend(iob_to_biluo(cols[-1]))
+            pos_tags.extend(cols[1] if len(cols) > 2 else ["-"] * length)
+
+        doc = Doc(nlp.vocab, words=words)
+        for i, token in enumerate(doc):
+            token.tag_ = pos_tags[i]
+            token.is_sent_start = sent_starts[i]
+        entities = tags_to_entities(biluo_tags)
+        doc.ents = [Span(doc, start=s, end=e+1, label=L) for L, s, e in entities]
+        output_docs.append(doc)
     return output_docs
 
 
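
As a standalone sketch of the Doc-building approach the rewritten loop takes, assuming the spaCy v2-style API; nlp and the sample words, tags and sentence starts are invented for illustration, and Span and tags_to_entities would still need to be imported in the module itself:

    import spacy
    from spacy.gold import iob_to_biluo, tags_to_entities
    from spacy.tokens import Doc, Span

    nlp = spacy.blank("xx")  # multi-language vocab, as MultiLanguage() above
    words = ["Alex", "visited", "Berlin", "."]
    pos_tags = ["NNP", "VBD", "NNP", "."]
    sent_starts = [True, False, False, False]
    biluo_tags = iob_to_biluo(["B-PER", "O", "B-GPE", "O"])

    doc = Doc(nlp.vocab, words=words)
    for i, token in enumerate(doc):
        token.tag_ = pos_tags[i]
        token.is_sent_start = sent_starts[i]

    # tags_to_entities returns (label, start, end) with an inclusive end index,
    # hence end=e + 1 when building the Span.
    entities = tags_to_entities(biluo_tags)
    doc.ents = [Span(doc, start=s, end=e + 1, label=label) for label, s, e in entities]
    print([(ent.text, ent.label_) for ent in doc.ents])  # [('Alex', 'PER'), ('Berlin', 'GPE')]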
@@ -1,14 +1,15 @@
 from wasabi import Printer
 
-from ...gold import iob_to_biluo
+from ...gold import iob_to_biluo, tags_to_entities
 from ...util import minibatch
+from .util import merge_sentences
 from .conll_ner2json import n_sents_info
 
 
-def iob2json(input_data, n_sents=10, no_print=False, *args, **kwargs):
+def iob2docs(input_data, n_sents=10, no_print=False, *args, **kwargs):
     """
     Convert IOB files with one sentence per line and tags separated with '|'
-    into JSON format for use with train cli. IOB and IOB2 are accepted.
+    into Doc objects so they can be saved. IOB and IOB2 are accepted.
 
     Sample formats:
 
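
To make the "Sample formats" concrete, here is roughly how one pipe-separated line splits into columns, mirroring the read_iob logic in the next hunk; illustrative only, the sentence is invented:

    from spacy.gold import iob_to_biluo

    line = "I|PRP|O like|VBP|O London|NNP|B-GPE .|.|O"
    tokens = [t.split("|") for t in line.split()]  # three fields: word, tag, IOB
    words, tags, iob = zip(*tokens)
    print(words)              # ('I', 'like', 'London', '.')
    print(iob_to_biluo(iob))  # ['O', 'O', 'U-GPE', 'O']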
@@ -26,40 +27,25 @@ def iob2json(input_data, n_sents=10, no_print=False, *args, **kwargs):
 
 
 def read_iob(raw_sents):
-    sentences = []
+    docs = []
     for line in raw_sents:
         if not line.strip():
             continue
         tokens = [t.split("|") for t in line.split()]
         if len(tokens[0]) == 3:
-            words, pos, iob = zip(*tokens)
+            words, tags, iob = zip(*tokens)
         elif len(tokens[0]) == 2:
             words, iob = zip(*tokens)
-            pos = ["-"] * len(words)
+            tags = ["-"] * len(words)
         else:
             raise ValueError(
                 "The sentence-per-line IOB/IOB2 file is not formatted correctly. Try checking whitespace and delimiters. See https://spacy.io/api/cli#convert"
             )
+        doc = Doc(vocab, words=words)
+        for i, tag in enumerate(pos):
+            doc[i].tag_ = tag
         biluo = iob_to_biluo(iob)
-        sentences.append(
-            [
-                {"orth": w, "tag": p, "ner": ent}
-                for (w, p, ent) in zip(words, pos, biluo)
-            ]
-        )
-    sentences = [{"tokens": sent} for sent in sentences]
-    paragraphs = [{"sentences": [sent]} for sent in sentences]
-    docs = [{"id": i, "paragraphs": [para]} for i, para in enumerate(paragraphs)]
+        entities = biluo_tags_to_entities(biluo)
+        doc.ents = [Span(doc, start=s, end=e, label=L) for (L, s, e) in entities]
+        docs.append(doc)
     return docs
-
-
-def merge_sentences(docs, n_sents):
-    merged = []
-    for group in minibatch(docs, size=n_sents):
-        group = list(group)
-        first = group.pop(0)
-        to_extend = first["paragraphs"][0]["sentences"]
-        for sent in group:
-            to_extend.extend(sent["paragraphs"][0]["sentences"])
-        merged.append(first)
-    return merged
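
The merge_sentences helper removed here (and imported from .util instead) groups sentence-level entries n_sents at a time; the grouping itself comes from spacy.util.minibatch, which behaves roughly like this small sketch with made-up sentences:

    from spacy.util import minibatch

    sentences = ["Sent one .", "Sent two .", "Sent three .", "Sent four .", "Sent five ."]
    for group in minibatch(sentences, size=2):
        # each group becomes one "document" of up to n_sents sentences
        print(list(group))
    # ['Sent one .', 'Sent two .']
    # ['Sent three .', 'Sent four .']
    # ['Sent five .']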
@@ -4,15 +4,17 @@ from ...gold import docs_to_json
 from ...util import get_lang_class, minibatch
 
 
-def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False, **_):
+def ner_jsonl2docs(input_data, lang=None, n_sents=10, use_morphology=False, **_):
     if lang is None:
         raise ValueError("No --lang specified, but tokenization required")
-    json_docs = []
+    docs = []
     input_examples = [srsly.json_loads(line) for line in input_data.strip().split("\n")]
     nlp = get_lang_class(lang)()
     sentencizer = nlp.create_pipe("sentencizer")
     for i, batch in enumerate(minibatch(input_examples, size=n_sents)):
         docs = []
+        # TODO: Should we be merging these? We're disrespecting the n_sents
+        # currently.
         for record in batch:
             raw_text = record["text"]
             if "entities" in record:
@@ -25,8 +27,7 @@ def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False, **_)
             spans = [doc.char_span(s, e, label=L) for s, e, L in ents]
             doc.ents = _cleanup_spans(spans)
             docs.append(doc)
-        json_docs.append(docs_to_json(docs, id=i))
-    return json_docs
+    return docs
 
 
 def _cleanup_spans(spans):
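
For context on the unchanged lines above, a hedged sketch of how character-offset entities from a JSONL record end up as doc.ents, assuming the spaCy v2-style API; the sample record and offsets are invented, and the None filter stands in for the _cleanup_spans step:

    import srsly
    from spacy.lang.en import English

    nlp = English()  # stands in for get_lang_class(lang)()
    sentencizer = nlp.create_pipe("sentencizer")

    record = srsly.json_loads('{"text": "Alex visited Berlin.", "entities": [[0, 4, "PER"], [13, 19, "GPE"]]}')
    doc = nlp.make_doc(record["text"])
    sentencizer(doc)

    spans = [doc.char_span(start, end, label=label) for start, end, label in record["entities"]]
    doc.ents = [span for span in spans if span is not None]  # char_span returns None on misaligned offsets
    print([(ent.text, ent.label_) for ent in doc.ents])  # [('Alex', 'PER'), ('Berlin', 'GPE')]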