mirror of
https://github.com/explosion/spaCy.git
synced 2025-10-24 12:41:23 +03:00
Fix merge
This commit is contained in:
commit
c2fd1e4eb9
|
@ -62,7 +62,7 @@ def convert_cli(
|
||||||
# We get an instance of the FileTypes from the CLI so we need its string value
|
# We get an instance of the FileTypes from the CLI so we need its string value
|
||||||
file_type = file_type.value
|
file_type = file_type.value
|
||||||
input_path = Path(input_path)
|
input_path = Path(input_path)
|
||||||
output_dir = Path(output_dir) if output_dir != "-" else "-"
|
output_dir = "-" if output_dir == Path("-") else output_dir
|
||||||
cli_args = locals()
|
cli_args = locals()
|
||||||
silent = output_dir == "-"
|
silent = output_dir == "-"
|
||||||
msg = Printer(no_print=silent)
|
msg = Printer(no_print=silent)
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
from wasabi import Printer
|
from wasabi import Printer
|
||||||
|
|
||||||
|
from .conll_ner2docs import n_sents_info
|
||||||
from ...gold import iob_to_biluo, tags_to_entities
|
from ...gold import iob_to_biluo, tags_to_entities
|
||||||
from ...tokens import Doc, Span
|
from ...tokens import Doc, Span
|
||||||
from .util import merge_sentences
|
from ...util import minibatch
|
||||||
from .conll_ner2docs import n_sents_info
|
|
||||||
|
|
||||||
|
|
||||||
def iob2docs(input_data, vocab, n_sents=10, no_print=False, *args, **kwargs):
|
def iob2docs(input_data, vocab, n_sents=10, no_print=False, *args, **kwargs):
|
||||||
|
@ -19,33 +19,46 @@ def iob2docs(input_data, vocab, n_sents=10, no_print=False, *args, **kwargs):
|
||||||
I|PRP|O like|VBP|O London|NNP|B-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O
|
I|PRP|O like|VBP|O London|NNP|B-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O
|
||||||
"""
|
"""
|
||||||
msg = Printer(no_print=no_print)
|
msg = Printer(no_print=no_print)
|
||||||
docs = read_iob(input_data.split("\n"), vocab)
|
|
||||||
if n_sents > 0:
|
if n_sents > 0:
|
||||||
n_sents_info(msg, n_sents)
|
n_sents_info(msg, n_sents)
|
||||||
docs = merge_sentences(docs, n_sents)
|
docs = read_iob(input_data.split("\n"), vocab, n_sents)
|
||||||
return docs
|
return docs
|
||||||
|
|
||||||
|
|
||||||
def read_iob(raw_sents, vocab):
|
def read_iob(raw_sents, vocab, n_sents):
|
||||||
docs = []
|
docs = []
|
||||||
for line in raw_sents:
|
for group in minibatch(raw_sents, size=n_sents):
|
||||||
|
tokens = []
|
||||||
|
words = []
|
||||||
|
tags = []
|
||||||
|
iob = []
|
||||||
|
sent_starts = []
|
||||||
|
for line in group:
|
||||||
if not line.strip():
|
if not line.strip():
|
||||||
continue
|
continue
|
||||||
tokens = [t.split("|") for t in line.split()]
|
sent_tokens = [t.split("|") for t in line.split()]
|
||||||
if len(tokens[0]) == 3:
|
if len(sent_tokens[0]) == 3:
|
||||||
words, tags, iob = zip(*tokens)
|
sent_words, sent_tags, sent_iob = zip(*sent_tokens)
|
||||||
elif len(tokens[0]) == 2:
|
elif len(sent_tokens[0]) == 2:
|
||||||
words, iob = zip(*tokens)
|
sent_words, sent_iob = zip(*sent_tokens)
|
||||||
tags = ["-"] * len(words)
|
sent_tags = ["-"] * len(sent_words)
|
||||||
else:
|
else:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"The sentence-per-line IOB/IOB2 file is not formatted correctly. Try checking whitespace and delimiters. See https://spacy.io/api/cli#convert"
|
"The sentence-per-line IOB/IOB2 file is not formatted correctly. Try checking whitespace and delimiters. See https://spacy.io/api/cli#convert"
|
||||||
)
|
)
|
||||||
|
words.extend(sent_words)
|
||||||
|
tags.extend(sent_tags)
|
||||||
|
iob.extend(sent_iob)
|
||||||
|
tokens.extend(sent_tokens)
|
||||||
|
sent_starts.append(True)
|
||||||
|
sent_starts.extend([False for _ in sent_words[1:]])
|
||||||
doc = Doc(vocab, words=words)
|
doc = Doc(vocab, words=words)
|
||||||
for i, tag in enumerate(tags):
|
for i, tag in enumerate(tags):
|
||||||
doc[i].tag_ = tag
|
doc[i].tag_ = tag
|
||||||
|
for i, sent_start in enumerate(sent_starts):
|
||||||
|
doc[i].is_sent_start = sent_start
|
||||||
biluo = iob_to_biluo(iob)
|
biluo = iob_to_biluo(iob)
|
||||||
entities = tags_to_entities(biluo)
|
entities = tags_to_entities(biluo)
|
||||||
doc.ents = [Span(doc, start=s, end=e, label=L) for (L, s, e) in entities]
|
doc.ents = [Span(doc, start=s, end=e+1, label=L) for (L, s, e) in entities]
|
||||||
docs.append(doc)
|
docs.append(doc)
|
||||||
return docs
|
return docs
|
||||||
|
|
|
@ -1,8 +0,0 @@
|
||||||
from spacy.util import minibatch
|
|
||||||
|
|
||||||
|
|
||||||
def merge_sentences(docs, n_sents):
|
|
||||||
merged = []
|
|
||||||
for group in minibatch(docs, size=n_sents):
|
|
||||||
raise NotImplementedError
|
|
||||||
return merged
|
|
|
@ -7,37 +7,18 @@ from .iob_utils import biluo_tags_from_offsets, tags_to_entities
|
||||||
import json
|
import json
|
||||||
|
|
||||||
|
|
||||||
def merge_sents(sents):
|
def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
|
||||||
m_deps = [[], [], [], [], [], []]
|
|
||||||
m_cats = {}
|
|
||||||
m_brackets = []
|
|
||||||
i = 0
|
|
||||||
for (ids, words, tags, heads, labels, ner), (cats, brackets) in sents:
|
|
||||||
m_deps[0].extend(id_ + i for id_ in ids)
|
|
||||||
m_deps[1].extend(words)
|
|
||||||
m_deps[2].extend(tags)
|
|
||||||
m_deps[3].extend(head + i for head in heads)
|
|
||||||
m_deps[4].extend(labels)
|
|
||||||
m_deps[5].extend(ner)
|
|
||||||
m_brackets.extend((b["first"] + i, b["last"] + i, b["label"])
|
|
||||||
for b in brackets)
|
|
||||||
m_cats.update(cats)
|
|
||||||
i += len(ids)
|
|
||||||
return [(m_deps, (m_cats, m_brackets))]
|
|
||||||
|
|
||||||
|
|
||||||
def docs_to_json(docs, id=0, ner_missing_tag="O"):
|
|
||||||
"""Convert a list of Doc objects into the JSON-serializable format used by
|
"""Convert a list of Doc objects into the JSON-serializable format used by
|
||||||
the spacy train command.
|
the spacy train command.
|
||||||
|
|
||||||
docs (iterable / Doc): The Doc object(s) to convert.
|
docs (iterable / Doc): The Doc object(s) to convert.
|
||||||
id (int): Id for the JSON.
|
doc_id (int): Id for the JSON.
|
||||||
RETURNS (dict): The data in spaCy's JSON format
|
RETURNS (dict): The data in spaCy's JSON format
|
||||||
- each input doc will be treated as a paragraph in the output doc
|
- each input doc will be treated as a paragraph in the output doc
|
||||||
"""
|
"""
|
||||||
if isinstance(docs, Doc):
|
if isinstance(docs, Doc):
|
||||||
docs = [docs]
|
docs = [docs]
|
||||||
json_doc = {"id": id, "paragraphs": []}
|
json_doc = {"id": doc_id, "paragraphs": []}
|
||||||
for i, doc in enumerate(docs):
|
for i, doc in enumerate(docs):
|
||||||
json_para = {'raw': doc.text, "sentences": [], "cats": [], "entities": [], "links": []}
|
json_para = {'raw': doc.text, "sentences": [], "cats": [], "entities": [], "links": []}
|
||||||
for cat, val in doc.cats.items():
|
for cat, val in doc.cats.items():
|
||||||
|
|
|
@ -172,6 +172,8 @@ def offsets_from_biluo_tags(doc, tags):
|
||||||
|
|
||||||
|
|
||||||
def tags_to_entities(tags):
|
def tags_to_entities(tags):
|
||||||
|
""" Note that the end index returned by this function is inclusive.
|
||||||
|
To use it for Span creation, increment the end by 1."""
|
||||||
entities = []
|
entities = []
|
||||||
start = None
|
start = None
|
||||||
for i, tag in enumerate(tags):
|
for i, tag in enumerate(tags):
|
||||||
|
|
|
@ -39,7 +39,8 @@ def forward(model, X, is_train):
|
||||||
|
|
||||||
def init(model, X=None, Y=None):
|
def init(model, X=None, Y=None):
|
||||||
model.get_ref("tok2vec").initialize(X=X)
|
model.get_ref("tok2vec").initialize(X=X)
|
||||||
lower = model.get_ref("lower").initialize()
|
lower = model.get_ref("lower")
|
||||||
|
lower.initialize()
|
||||||
if model.attrs["has_upper"]:
|
if model.attrs["has_upper"]:
|
||||||
statevecs = model.ops.alloc2f(2, lower.get_dim("nO"))
|
statevecs = model.ops.alloc2f(2, lower.get_dim("nO"))
|
||||||
model.get_ref("upper").initialize(X=statevecs)
|
model.get_ref("upper").initialize(X=statevecs)
|
||||||
|
|
|
@ -10,7 +10,6 @@ from spacy.cli.pretrain import make_docs
|
||||||
# from spacy.gold.converters import conllu2docs
|
# from spacy.gold.converters import conllu2docs
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail
|
|
||||||
def test_cli_converters_conllu2json():
|
def test_cli_converters_conllu2json():
|
||||||
# from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu
|
# from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu
|
||||||
lines = [
|
lines = [
|
||||||
|
@ -35,7 +34,6 @@ def test_cli_converters_conllu2json():
|
||||||
assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"]
|
assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"]
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"lines",
|
"lines",
|
||||||
[
|
[
|
||||||
|
@ -73,7 +71,6 @@ def test_cli_converters_conllu2json_name_ner_map(lines):
|
||||||
assert [t["ner"] for t in tokens] == ["O", "B-PERSON", "L-PERSON", "O", "O"]
|
assert [t["ner"] for t in tokens] == ["O", "B-PERSON", "L-PERSON", "O", "O"]
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail
|
|
||||||
def test_cli_converters_conllu2json_subtokens():
|
def test_cli_converters_conllu2json_subtokens():
|
||||||
# https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu
|
# https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu
|
||||||
lines = [
|
lines = [
|
||||||
|
@ -117,7 +114,6 @@ def test_cli_converters_conllu2json_subtokens():
|
||||||
assert [t["ner"] for t in tokens] == ["O", "U-PER", "O", "O"]
|
assert [t["ner"] for t in tokens] == ["O", "U-PER", "O", "O"]
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail
|
|
||||||
def test_cli_converters_iob2json(en_vocab):
|
def test_cli_converters_iob2json(en_vocab):
|
||||||
lines = [
|
lines = [
|
||||||
"I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",
|
"I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",
|
||||||
|
@ -127,22 +123,22 @@ def test_cli_converters_iob2json(en_vocab):
|
||||||
]
|
]
|
||||||
input_data = "\n".join(lines)
|
input_data = "\n".join(lines)
|
||||||
converted_docs = iob2docs(input_data, en_vocab, n_sents=10)
|
converted_docs = iob2docs(input_data, en_vocab, n_sents=10)
|
||||||
|
assert len(converted_docs) == 1
|
||||||
converted = docs_to_json(converted_docs)
|
converted = docs_to_json(converted_docs)
|
||||||
assert len(converted) == 1
|
assert converted["id"] == 0
|
||||||
assert converted[0]["id"] == 0
|
assert len(converted["paragraphs"]) == 1
|
||||||
assert len(converted[0]["paragraphs"]) == 1
|
assert len(converted["paragraphs"][0]["sentences"]) == 4
|
||||||
assert len(converted[0]["paragraphs"][0]["sentences"]) == 4
|
|
||||||
for i in range(0, 4):
|
for i in range(0, 4):
|
||||||
sent = converted[0]["paragraphs"][0]["sentences"][i]
|
sent = converted["paragraphs"][0]["sentences"][i]
|
||||||
assert len(sent["tokens"]) == 8
|
assert len(sent["tokens"]) == 8
|
||||||
tokens = sent["tokens"]
|
tokens = sent["tokens"]
|
||||||
# fmt: off
|
# fmt: off
|
||||||
assert [t["orth"] for t in tokens] == ["I", "like", "London", "and", "New", "York", "City", "."]
|
assert [t["orth"] for t in tokens] == ["I", "like", "London", "and", "New", "York", "City", "."]
|
||||||
assert [t["ner"] for t in tokens] == ["O", "O", "U-GPE", "O", "B-GPE", "I-GPE", "L-GPE", "O"]
|
assert len(converted_docs[0].ents) == 8
|
||||||
# fmt: on
|
for ent in converted_docs[0].ents:
|
||||||
|
assert(ent.text in ["New York City", "London"])
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail
|
|
||||||
def test_cli_converters_conll_ner2json():
|
def test_cli_converters_conll_ner2json():
|
||||||
lines = [
|
lines = [
|
||||||
"-DOCSTART- -X- O O",
|
"-DOCSTART- -X- O O",
|
||||||
|
@ -194,19 +190,21 @@ def test_cli_converters_conll_ner2json():
|
||||||
]
|
]
|
||||||
input_data = "\n".join(lines)
|
input_data = "\n".join(lines)
|
||||||
converted_docs = conll_ner2docs(input_data, n_sents=10)
|
converted_docs = conll_ner2docs(input_data, n_sents=10)
|
||||||
|
assert len(converted_docs) == 1
|
||||||
converted = docs_to_json(converted_docs)
|
converted = docs_to_json(converted_docs)
|
||||||
assert len(converted) == 1
|
assert converted["id"] == 0
|
||||||
assert converted[0]["id"] == 0
|
assert len(converted["paragraphs"]) == 1
|
||||||
assert len(converted[0]["paragraphs"]) == 1
|
assert len(converted["paragraphs"][0]["sentences"]) == 5
|
||||||
assert len(converted[0]["paragraphs"][0]["sentences"]) == 5
|
|
||||||
for i in range(0, 5):
|
for i in range(0, 5):
|
||||||
sent = converted[0]["paragraphs"][0]["sentences"][i]
|
sent = converted["paragraphs"][0]["sentences"][i]
|
||||||
assert len(sent["tokens"]) == 8
|
assert len(sent["tokens"]) == 8
|
||||||
tokens = sent["tokens"]
|
tokens = sent["tokens"]
|
||||||
# fmt: off
|
# fmt: off
|
||||||
assert [t["orth"] for t in tokens] == ["I", "like", "London", "and", "New", "York", "City", "."]
|
assert [t["orth"] for t in tokens] == ["I", "like", "London", "and", "New", "York", "City", "."]
|
||||||
assert [t["ner"] for t in tokens] == ["O", "O", "U-GPE", "O", "B-GPE", "I-GPE", "L-GPE", "O"]
|
|
||||||
# fmt: on
|
# fmt: on
|
||||||
|
assert len(converted_docs[0].ents) == 10
|
||||||
|
for ent in converted_docs[0].ents:
|
||||||
|
assert (ent.text in ["New York City", "London"])
|
||||||
|
|
||||||
|
|
||||||
def test_pretrain_make_docs():
|
def test_pretrain_make_docs():
|
||||||
|
|
Loading…
Reference in New Issue
Block a user