mirror of https://github.com/explosion/spaCy.git

Start updating converters

parent 6d5bfd6f6a
commit d422f30a18
@@ -1,4 +1,4 @@
 from .conllu2json import conllu2json  # noqa: F401
-from .iob2json import iob2json  # noqa: F401
+from .iob2json import iob2docs  # noqa: F401
 from .conll_ner2json import conll_ner2json  # noqa: F401
-from .jsonl2json import ner_jsonl2json  # noqa: F401
+from .jsonl2docs import ner_jsonl2json  # noqa: F401
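
The import changes above rename the IOB and JSONL entry points from *2json to *2docs: the converters are meant to return Doc objects rather than training-JSON dicts. A hedged sketch of why that matters downstream (DocBin and the file name are illustrative, not part of this commit):

# Sketch only: once a converter returns a list of Doc objects, they can be
# serialized directly, e.g. with spaCy's DocBin container.
from spacy.tokens import Doc, DocBin
from spacy.vocab import Vocab

docs = [Doc(Vocab(), words=["I", "like", "London"])]  # stand-in for converter output
doc_bin = DocBin(attrs=["TAG", "ENT_IOB", "ENT_TYPE"])
for doc in docs:
    doc_bin.add(doc)
with open("train.docbin", "wb") as file_:
    file_.write(doc_bin.to_bytes())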
|
|
|
@@ -3,15 +3,16 @@ from wasabi import Printer
 from ...gold import iob_to_biluo
 from ...lang.xx import MultiLanguage
 from ...tokens.doc import Doc
+from ...vocab import Vocab
 from ...util import load_model
 
 
-def conll_ner2json(
+def conll_ner2doc(
     input_data, n_sents=10, seg_sents=False, model=None, no_print=False, **kwargs
 ):
     """
     Convert files in the CoNLL-2003 NER format and similar
-    whitespace-separated columns into JSON format for use with train cli.
+    whitespace-separated columns into Doc objects.
 
     The first column is the tokens, the final column is the IOB tags. If an
     additional second column is present, the second column is the tags.
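
As a reading aid for the docstring above, made-up input in the layout it describes (whitespace-separated columns, token first, IOB tag last, optional tag column in between; blank lines separate sentences):

# Illustration only; data invented to match the docstring's description.
two_columns = (
    "EU B-ORG\n"
    "rejects O\n"
    "German B-MISC\n"
    "call O\n"
    "\n"
    "Peter B-PER\n"
    "Blackburn I-PER\n"
)
three_columns = (
    "EU NNP B-ORG\n"
    "rejects VBZ O\n"
)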
|
@@ -81,17 +82,25 @@ def conll_ner2json(
             "No document delimiters found. Use `-n` to automatically group "
             "sentences into documents."
         )
+    if model:
+        nlp = load_model(model)
+    else:
+        nlp = MultiLanguage()
     output_docs = []
-    for doc in input_data.strip().split(doc_delimiter):
-        doc = doc.strip()
-        if not doc:
+    for conll_doc in input_data.strip().split(doc_delimiter):
+        conll_doc = conll_doc.strip()
+        if not conll_doc:
             continue
-        output_doc = []
-        for sent in doc.split("\n\n"):
-            sent = sent.strip()
+        words = []
+        sent_starts = []
+        pos_tags = []
+        biluo_tags = []
+        for conll_sent in conll_doc.split("\n\n"):
+            conll_sent = conll_sent.strip()
             if not sent:
                 continue
-            lines = [line.strip() for line in sent.split("\n") if line.strip()]
+            lines = [line.strip() for line in conll_sent.split("\n") if line.strip()]
             cols = list(zip(*[line.split() for line in lines]))
             if len(cols) < 2:
                 raise ValueError(
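
The model/MultiLanguage branch added above is a load-if-given, otherwise-blank fallback so a vocab and tokenizer are always available; sketched standalone below (get_nlp is a made-up helper name, not part of this diff):

# Standalone sketch of the fallback pattern used above.
from spacy.lang.xx import MultiLanguage
from spacy.util import load_model


def get_nlp(model=None):
    if model:
        return load_model(model)
    return MultiLanguage()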
|
@@ -99,25 +108,19 @@ def conll_ner2json(
                     "Try checking whitespace and delimiters. See "
                     "https://spacy.io/api/cli#convert"
                 )
-            words = cols[0]
-            iob_ents = cols[-1]
-            if len(cols) > 2:
-                tags = cols[1]
-            else:
-                tags = ["-"] * len(words)
-            biluo_ents = iob_to_biluo(iob_ents)
-            output_doc.append(
-                {
-                    "tokens": [
-                        {"orth": w, "tag": tag, "ner": ent}
-                        for (w, tag, ent) in zip(words, tags, biluo_ents)
-                    ]
-                }
-            )
-        output_docs.append(
-            {"id": len(output_docs), "paragraphs": [{"sentences": output_doc}]}
-        )
-        output_doc = []
+            length = len(cols[0])
+            words.extend(cols[0])
+            sent_stats.extend([True] + [False] * (length - 1))
+            biluo_tags.extend(iob_to_biluo(cols[-1]))
+            pos_tags.extend(cols[1] if len(cols) > 2 else ["-"] * length)
+
+        doc = Doc(nlp.vocab, words=words)
+        for i, token in enumerate(doc):
+            token.tag_ = pos_tags[i]
+            token.is_sent_start = sent_starts[i]
+        entities = tags_to_entities(biluo_tags)
+        doc.ents = [Span(doc, start=s, end=e+1, label=L) for L, s, e in entities]
+        output_docs.append(doc)
     return output_docs
 
 
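
The replacement loop accumulates words, tags, sentence starts and BILUO tags per CoNLL document and only then builds one Doc. A self-contained sketch of that final stage, assuming iob_to_biluo and tags_to_entities live in spacy.gold as they did in the spaCy 2.x line (they later moved to spacy.training); the data is made up:

# Hedged, standalone sketch of the Doc-building stage shown above.
from spacy.gold import iob_to_biluo, tags_to_entities
from spacy.tokens import Span
from spacy.tokens.doc import Doc
from spacy.vocab import Vocab

words = ["EU", "rejects", "German", "call"]
pos_tags = ["NNP", "VBZ", "JJ", "NN"]
sent_starts = [True, False, False, False]
biluo_tags = iob_to_biluo(["B-ORG", "O", "B-MISC", "O"])

doc = Doc(Vocab(), words=words)
for i, token in enumerate(doc):
    token.tag_ = pos_tags[i]
    token.is_sent_start = sent_starts[i]
# tags_to_entities yields (label, start, end) with an inclusive end token
# index, which is why the Span above is built with end=e+1.
entities = tags_to_entities(biluo_tags)
doc.ents = [Span(doc, start=s, end=e + 1, label=label) for label, s, e in entities]
print([(ent.text, ent.label_) for ent in doc.ents])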
|
|
@@ -1,14 +1,15 @@
 from wasabi import Printer
 
-from ...gold import iob_to_biluo
+from ...gold import iob_to_biluo, tags_to_entities
 from ...util import minibatch
+from .util import merge_sentences
 from .conll_ner2json import n_sents_info
 
 
-def iob2json(input_data, n_sents=10, no_print=False, *args, **kwargs):
+def iob2docs(input_data, n_sents=10, no_print=False, *args, **kwargs):
     """
     Convert IOB files with one sentence per line and tags separated with '|'
-    into JSON format for use with train cli. IOB and IOB2 are accepted.
+    into Doc objects so they can be saved. IOB and IOB2 are accepted.
 
     Sample formats:
 
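
The sample formats themselves sit outside this hunk. For orientation, made-up lines in the pipe-separated shape the docstring describes (token|tag|IOB with three fields, or token|IOB with two):

# Illustration only; the docstring's real samples are not shown in this hunk.
three_fields = "I|PRP|O like|VBP|O London|NNP|B-GPE .|.|O"
two_fields = "I|O like|O London|B-GPE .|O"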
|
@@ -26,40 +27,25 @@ def iob2json(input_data, n_sents=10, no_print=False, *args, **kwargs):
 
 
 def read_iob(raw_sents):
-    sentences = []
+    docs = []
     for line in raw_sents:
         if not line.strip():
             continue
         tokens = [t.split("|") for t in line.split()]
         if len(tokens[0]) == 3:
-            words, pos, iob = zip(*tokens)
+            words, tags, iob = zip(*tokens)
         elif len(tokens[0]) == 2:
             words, iob = zip(*tokens)
-            pos = ["-"] * len(words)
+            tags = ["-"] * len(words)
         else:
             raise ValueError(
                 "The sentence-per-line IOB/IOB2 file is not formatted correctly. Try checking whitespace and delimiters. See https://spacy.io/api/cli#convert"
             )
+        doc = Doc(vocab, words=words)
+        for i, tag in enumerate(pos):
+            doc[i].tag_ = tag
         biluo = iob_to_biluo(iob)
-        sentences.append(
-            [
-                {"orth": w, "tag": p, "ner": ent}
-                for (w, p, ent) in zip(words, pos, biluo)
-            ]
-        )
-    sentences = [{"tokens": sent} for sent in sentences]
-    paragraphs = [{"sentences": [sent]} for sent in sentences]
-    docs = [{"id": i, "paragraphs": [para]} for i, para in enumerate(paragraphs)]
+        entities = biluo_tags_to_entities(biluo)
+        doc.ents = [Span(doc, start=s, end=e, label=L) for (L, s, e) in entities]
+        docs.append(doc)
     return docs
-
-
-def merge_sentences(docs, n_sents):
-    merged = []
-    for group in minibatch(docs, size=n_sents):
-        group = list(group)
-        first = group.pop(0)
-        to_extend = first["paragraphs"][0]["sentences"]
-        for sent in group:
-            to_extend.extend(sent["paragraphs"][0]["sentences"])
-        merged.append(first)
-    return merged
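
How read_iob's per-line parsing turns one such line into parallel word/tag/IOB sequences, shown with plain Python (data made up):

# Pure-Python illustration of the column transpose used in read_iob.
line = "I|PRP|O like|VBP|O London|NNP|B-GPE"
tokens = [t.split("|") for t in line.split()]
# tokens == [["I", "PRP", "O"], ["like", "VBP", "O"], ["London", "NNP", "B-GPE"]]
words, tags, iob = zip(*tokens)
assert words == ("I", "like", "London")
assert tags == ("PRP", "VBP", "NNP")
assert iob == ("O", "O", "B-GPE")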
|
@@ -4,15 +4,17 @@ from ...gold import docs_to_json
 from ...util import get_lang_class, minibatch
 
 
-def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False, **_):
+def ner_jsonl2docs(input_data, lang=None, n_sents=10, use_morphology=False, **_):
     if lang is None:
         raise ValueError("No --lang specified, but tokenization required")
-    json_docs = []
+    docs = []
     input_examples = [srsly.json_loads(line) for line in input_data.strip().split("\n")]
     nlp = get_lang_class(lang)()
     sentencizer = nlp.create_pipe("sentencizer")
     for i, batch in enumerate(minibatch(input_examples, size=n_sents)):
         docs = []
+        # TODO: Should we be merging these? We're disrespecting the n_sents
+        # currently.
         for record in batch:
             raw_text = record["text"]
             if "entities" in record:
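
For orientation, one input record in the shape this converter reads: a "text" field plus character-offset entity triples (shape inferred from the record["text"] access here and the span construction in the next hunk; the data is made up):

# Hedged sketch of a single JSONL line as consumed above.
import srsly

line = '{"text": "I like London.", "entities": [[7, 13, "GPE"]]}'
record = srsly.json_loads(line)
assert record["text"] == "I like London."
start, end, label = record["entities"][0]
assert (start, end, label) == (7, 13, "GPE")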
|
@@ -25,8 +27,7 @@ def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False, **_)
             spans = [doc.char_span(s, e, label=L) for s, e, L in ents]
             doc.ents = _cleanup_spans(spans)
             docs.append(doc)
-        json_docs.append(docs_to_json(docs, id=i))
-    return json_docs
+    return docs
 
 
 def _cleanup_spans(spans):
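
The Doc objects returned here come from tokenizing the raw text and mapping character offsets onto token spans. A minimal sketch of that pattern with public spaCy calls (language class and data are illustrative):

# Minimal sketch of the char_span pattern used above. char_span returns None
# when offsets do not line up with token boundaries, which is what the
# _cleanup_spans helper seen above has to handle.
from spacy.lang.en import English

nlp = English()
doc = nlp("I like London.")
span = doc.char_span(7, 13, label="GPE")
if span is not None:
    doc.ents = [span]
print([(ent.text, ent.label_) for ent in doc.ents])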