From 0962ee7ce6291d63d05a3269cd9c7b3443ee291a Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Tue, 23 Jun 2020 13:49:21 +0200
Subject: [PATCH 1/6] fix output_dir (converted to Path by typer)

---
 spacy/cli/convert.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index 3957fd27a..c946b55d4 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -62,7 +62,7 @@ def convert_cli(
     # We get an instance of the FileTypes from the CLI so we need its string value
     file_type = file_type.value
     input_path = Path(input_path)
-    output_dir = Path(output_dir) if output_dir != "-" else "-"
+    output_dir = "-" if output_dir == Path("-") else output_dir
     cli_args = locals()
     silent = output_dir == "-"
     msg = Printer(no_print=silent)
@@ -124,7 +124,7 @@
             _print_docs_to_stdout(docs, file_type)
         else:
             subpath = input_loc.relative_to(input_path)
-            output_file = Path(output_dir) / subpath.with_suffix(f".{file_type}")
+            output_file = output_dir / subpath.with_suffix(f".{file_type}")
             _write_docs_to_file(docs, output_file, file_type)
             msg.good(f"Generated output file ({len(docs)} documents): {output_file}")
 

From 790b37390136bab87126c3eb69fb9354aef5c7c5 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Tue, 23 Jun 2020 14:05:00 +0200
Subject: [PATCH 2/6] fix var

---
 spacy/ml/tb_framework.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py
index 21779ddaa..88f27f0bf 100644
--- a/spacy/ml/tb_framework.py
+++ b/spacy/ml/tb_framework.py
@@ -39,7 +39,8 @@ def forward(model, X, is_train):
 
 def init(model, X=None, Y=None):
     model.get_ref("tok2vec").initialize(X=X)
-    model.get_ref("lower").initialize()
+    lower = model.get_ref("lower")
+    lower.initialize()
     if model.attrs["has_upper"]:
         statevecs = model.ops.alloc2f(2, lower.get_dim("nO"))
         model.get_ref("upper").initialize(X=statevecs)

From 5cf3eeee0d4cd3906c94dacb1382cf4c0c149f89 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Tue, 23 Jun 2020 14:49:31 +0200
Subject: [PATCH 3/6] bugfix: update states after creating golds to avoid out
 of bounds indexing

---
 spacy/syntax/arc_eager.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx
index 28787f97d..b57c4f312 100644
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@@ -615,8 +615,8 @@ cdef class ArcEager(TransitionSystem):
     def init_gold_batch(self, examples):
        states = self.init_batch([eg.predicted for eg in examples])
        keeps = [i for i, s in enumerate(states) if not s.is_final()]
-        states = [states[i] for i in keeps]
         golds = [ArcEagerGold(self, states[i], examples[i]) for i in keeps]
+        states = [states[i] for i in keeps]
         for gold in golds:
             self._replace_unseen_labels(gold)
         n_steps = sum([len(s.queue) * 4 for s in states])

From 351ab3a3d4fbe9f6fb1c8780b63b14c9b0da1ce1 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Tue, 23 Jun 2020 16:47:30 +0200
Subject: [PATCH 4/6] pull merge_sent into iob2docs to avoid Doc creation for
 each line

---
 spacy/gold/converters/iob2docs.py | 49 +++++++++++++++++++------------
 spacy/gold/converters/util.py     |  8 -----
 spacy/gold/gold_io.pyx            | 25 ++--------------
 spacy/tests/test_cli.py           | 15 +++++-----
 4 files changed, 41 insertions(+), 56 deletions(-)
 delete mode 100644 spacy/gold/converters/util.py

diff --git a/spacy/gold/converters/iob2docs.py b/spacy/gold/converters/iob2docs.py
index aba23e1b3..27876ba7a 100644
--- a/spacy/gold/converters/iob2docs.py
+++ b/spacy/gold/converters/iob2docs.py
@@ -1,9 +1,9 @@
 from wasabi import Printer
 
+from .conll_ner2docs import n_sents_info
 from ...gold import iob_to_biluo, tags_to_entities
 from ...tokens import Doc, Span
-from .util import merge_sentences
-from .conll_ner2docs import n_sents_info
+from ...util import minibatch
 
 
 def iob2docs(input_data, vocab, n_sents=10, no_print=False, *args, **kwargs):
@@ -19,31 +19,44 @@
     I|PRP|O like|VBP|O London|NNP|B-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O
     """
     msg = Printer(no_print=no_print)
-    docs = read_iob(input_data.split("\n"), vocab)
     if n_sents > 0:
         n_sents_info(msg, n_sents)
-        docs = merge_sentences(docs, n_sents)
+    docs = read_iob(input_data.split("\n"), vocab, n_sents)
     return docs
 
 
-def read_iob(raw_sents, vocab):
+def read_iob(raw_sents, vocab, n_sents):
     docs = []
-    for line in raw_sents:
-        if not line.strip():
-            continue
-        tokens = [t.split("|") for t in line.split()]
-        if len(tokens[0]) == 3:
-            words, tags, iob = zip(*tokens)
-        elif len(tokens[0]) == 2:
-            words, iob = zip(*tokens)
-            tags = ["-"] * len(words)
-        else:
-            raise ValueError(
-                "The sentence-per-line IOB/IOB2 file is not formatted correctly. Try checking whitespace and delimiters. See https://spacy.io/api/cli#convert"
-            )
+    for group in minibatch(raw_sents, size=n_sents):
+        tokens = []
+        words = []
+        tags = []
+        iob = []
+        sent_starts = []
+        for line in group:
+            if not line.strip():
+                continue
+            sent_tokens = [t.split("|") for t in line.split()]
+            if len(sent_tokens[0]) == 3:
+                sent_words, sent_tags, sent_iob = zip(*sent_tokens)
+            elif len(sent_tokens[0]) == 2:
+                sent_words, sent_iob = zip(*sent_tokens)
+                sent_tags = ["-"] * len(sent_words)
+            else:
+                raise ValueError(
+                    "The sentence-per-line IOB/IOB2 file is not formatted correctly. Try checking whitespace and delimiters. See https://spacy.io/api/cli#convert"
+                )
+            words.extend(sent_words)
+            tags.extend(sent_tags)
+            iob.extend(sent_iob)
+            tokens.extend(sent_tokens)
+            sent_starts.append(True)
+            sent_starts.extend([False for _ in sent_words[1:]])
         doc = Doc(vocab, words=words)
         for i, tag in enumerate(tags):
             doc[i].tag_ = tag
+        for i, sent_start in enumerate(sent_starts):
+            doc[i].is_sent_start = sent_start
         biluo = iob_to_biluo(iob)
         entities = tags_to_entities(biluo)
         doc.ents = [Span(doc, start=s, end=e, label=L) for (L, s, e) in entities]

diff --git a/spacy/gold/converters/util.py b/spacy/gold/converters/util.py
deleted file mode 100644
index 41b3e6d24..000000000
--- a/spacy/gold/converters/util.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from spacy.util import minibatch
-
-
-def merge_sentences(docs, n_sents):
-    merged = []
-    for group in minibatch(docs, size=n_sents):
-        raise NotImplementedError
-    return merged

diff --git a/spacy/gold/gold_io.pyx b/spacy/gold/gold_io.pyx
index 10ca427ed..61ffb2cfb 100644
--- a/spacy/gold/gold_io.pyx
+++ b/spacy/gold/gold_io.pyx
@@ -6,37 +6,18 @@
 from ..tokens import Doc
 from .iob_utils import biluo_tags_from_offsets
 
-def merge_sents(sents):
-    m_deps = [[], [], [], [], [], []]
-    m_cats = {}
-    m_brackets = []
-    i = 0
-    for (ids, words, tags, heads, labels, ner), (cats, brackets) in sents:
-        m_deps[0].extend(id_ + i for id_ in ids)
-        m_deps[1].extend(words)
-        m_deps[2].extend(tags)
-        m_deps[3].extend(head + i for head in heads)
-        m_deps[4].extend(labels)
-        m_deps[5].extend(ner)
-        m_brackets.extend((b["first"] + i, b["last"] + i, b["label"])
-                          for b in brackets)
-        m_cats.update(cats)
-        i += len(ids)
-    return [(m_deps, (m_cats, m_brackets))]
-
-
-def docs_to_json(docs, id=0, ner_missing_tag="O"):
+def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
     """Convert a list of Doc objects into the JSON-serializable format used by
     the spacy train command.
 
     docs (iterable / Doc): The Doc object(s) to convert.
-    id (int): Id for the JSON.
+    doc_id (int): Id for the JSON.
     RETURNS (dict): The data in spaCy's JSON format
         - each input doc will be treated as a paragraph in the output doc
     """
     if isinstance(docs, Doc):
         docs = [docs]
-    json_doc = {"id": id, "paragraphs": []}
+    json_doc = {"id": doc_id, "paragraphs": []}
     for i, doc in enumerate(docs):
         json_para = {'raw': doc.text, "sentences": [], "cats": [], "entities": [], "links": []}
         for cat, val in doc.cats.items():

diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index 164961a5b..0cf070b61 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -117,7 +117,6 @@ def test_cli_converters_conllu2json_subtokens():
     assert [t["ner"] for t in tokens] == ["O", "U-PER", "O", "O"]
 
 
-@pytest.mark.xfail
 def test_cli_converters_iob2json(en_vocab):
     lines = [
         "I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",
@@ -127,19 +126,19 @@ def test_cli_converters_iob2json(en_vocab):
     ]
     input_data = "\n".join(lines)
     converted_docs = iob2docs(input_data, en_vocab, n_sents=10)
+    assert len(converted_docs) == 1
     converted = docs_to_json(converted_docs)
-    assert len(converted) == 1
-    assert converted[0]["id"] == 0
-    assert len(converted[0]["paragraphs"]) == 1
-    assert len(converted[0]["paragraphs"][0]["sentences"]) == 4
+    assert converted["id"] == 0
+    assert len(converted["paragraphs"]) == 1
+    assert len(converted["paragraphs"][0]["sentences"]) == 4
     for i in range(0, 4):
-        sent = converted[0]["paragraphs"][0]["sentences"][i]
+        sent = converted["paragraphs"][0]["sentences"][i]
         assert len(sent["tokens"]) == 8
         tokens = sent["tokens"]
         # fmt: off
         assert [t["orth"] for t in tokens] == ["I", "like", "London", "and", "New", "York", "City", "."]
-        assert [t["ner"] for t in tokens] == ["O", "O", "U-GPE", "O", "B-GPE", "I-GPE", "L-GPE", "O"]
-        # fmt: on
+
+    assert converted["paragraphs"][0]["entities"] == [(18, 26, 'GPE'), (52, 60, 'GPE'), (86, 94, 'GPE'), (120, 128, 'GPE')]
 
 
 @pytest.mark.xfail

From 7c76a2b796a3dfc8f51cb6dd34b278d32441d2f4 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Tue, 23 Jun 2020 17:09:37 +0200
Subject: [PATCH 5/6] fix asserts

---
 spacy/tests/test_cli.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index 0cf070b61..26d7ebd93 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -117,6 +117,7 @@ def test_cli_converters_conllu2json_subtokens():
     assert [t["ner"] for t in tokens] == ["O", "U-PER", "O", "O"]
 
 
+@pytest.mark.xfail
 def test_cli_converters_iob2json(en_vocab):
     lines = [
         "I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",
@@ -137,6 +138,9 @@ def test_cli_converters_iob2json(en_vocab):
         tokens = sent["tokens"]
         # fmt: off
         assert [t["orth"] for t in tokens] == ["I", "like", "London", "and", "New", "York", "City", "."]
+    assert len(converted_docs[0].ents) == 8
+    for ent in converted_docs[0].ents:
+        assert(ent.text in ["New York City", "London"])
 
     assert converted["paragraphs"][0]["entities"] == [(18, 26, 'GPE'), (52, 60, 'GPE'), (86, 94, 'GPE'), (120, 128, 'GPE')]
 

From 28ad71c1879586facbf77f7d0f68bc58a256171c Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Tue, 23 Jun 2020 17:20:41 +0200
Subject: [PATCH 6/6] bugfix excl Span.end in iob2docs

---
 spacy/gold/converters/iob2docs.py |  2 +-
 spacy/gold/iob_utils.py           |  2 ++
 spacy/tests/test_cli.py           | 21 ++++++++-------------
 3 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/spacy/gold/converters/iob2docs.py b/spacy/gold/converters/iob2docs.py
index 27876ba7a..51321a470 100644
--- a/spacy/gold/converters/iob2docs.py
+++ b/spacy/gold/converters/iob2docs.py
@@ -59,6 +59,6 @@ def read_iob(raw_sents, vocab, n_sents):
             doc[i].is_sent_start = sent_start
         biluo = iob_to_biluo(iob)
         entities = tags_to_entities(biluo)
-        doc.ents = [Span(doc, start=s, end=e, label=L) for (L, s, e) in entities]
+        doc.ents = [Span(doc, start=s, end=e+1, label=L) for (L, s, e) in entities]
         docs.append(doc)
     return docs

diff --git a/spacy/gold/iob_utils.py b/spacy/gold/iob_utils.py
index 3ae911418..b3d605296 100644
--- a/spacy/gold/iob_utils.py
+++ b/spacy/gold/iob_utils.py
@@ -172,6 +172,8 @@ def offsets_from_biluo_tags(doc, tags):
 
 
 def tags_to_entities(tags):
+    """ Note that the end index returned by this function is inclusive.
+    To use it for Span creation, increment the end by 1."""
     entities = []
     start = None
     for i, tag in enumerate(tags):

diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index 26d7ebd93..ca0f3710f 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -10,7 +10,6 @@ from spacy.cli.pretrain import make_docs
 # from spacy.gold.converters import conllu2docs
 
 
-@pytest.mark.xfail
 def test_cli_converters_conllu2json():
     # from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu
     lines = [
@@ -35,7 +34,6 @@ def test_cli_converters_conllu2json():
     assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"]
 
 
-@pytest.mark.xfail
 @pytest.mark.parametrize(
     "lines",
     [
@@ -73,7 +71,6 @@ def test_cli_converters_conllu2json_name_ner_map(lines):
     assert [t["ner"] for t in tokens] == ["O", "B-PERSON", "L-PERSON", "O", "O"]
 
 
-@pytest.mark.xfail
 def test_cli_converters_conllu2json_subtokens():
     # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu
     lines = [
@@ -117,7 +114,6 @@ def test_cli_converters_conllu2json_subtokens():
     assert [t["ner"] for t in tokens] == ["O", "U-PER", "O", "O"]
 
 
-@pytest.mark.xfail
 def test_cli_converters_iob2json(en_vocab):
     lines = [
         "I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",
@@ -142,10 +138,7 @@ def test_cli_converters_iob2json(en_vocab):
     for ent in converted_docs[0].ents:
         assert(ent.text in ["New York City", "London"])
 
-    assert converted["paragraphs"][0]["entities"] == [(18, 26, 'GPE'), (52, 60, 'GPE'), (86, 94, 'GPE'), (120, 128, 'GPE')]
 
-
-@pytest.mark.xfail
 def test_cli_converters_conll_ner2json():
     lines = [
         "-DOCSTART- -X- O O",
@@ -197,19 +190,21 @@ def test_cli_converters_conll_ner2json():
     ]
     input_data = "\n".join(lines)
     converted_docs = conll_ner2docs(input_data, n_sents=10)
+    assert len(converted_docs) == 1
     converted = docs_to_json(converted_docs)
-    assert len(converted) == 1
-    assert converted[0]["id"] == 0
-    assert len(converted[0]["paragraphs"]) == 1
-    assert len(converted[0]["paragraphs"][0]["sentences"]) == 5
+    assert converted["id"] == 0
+    assert len(converted["paragraphs"]) == 1
+    assert len(converted["paragraphs"][0]["sentences"]) == 5
     for i in range(0, 5):
-        sent = converted[0]["paragraphs"][0]["sentences"][i]
+        sent = converted["paragraphs"][0]["sentences"][i]
         assert len(sent["tokens"]) == 8
         tokens = sent["tokens"]
         # fmt: off
         assert [t["orth"] for t in tokens] == ["I", "like", "London", "and", "New", "York", "City", "."]
-        assert [t["ner"] for t in tokens] == ["O", "O", "U-GPE", "O", "B-GPE", "I-GPE", "L-GPE", "O"]
         # fmt: on
+    assert len(converted_docs[0].ents) == 10
+    for ent in converted_docs[0].ents:
+        assert (ent.text in ["New York City", "London"])
 
 
 def test_pretrain_make_docs():
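
Reviewer notes (appended by the editor; illustrative sketches only, not part of the patch series above).

Note on PATCH 3/6: the out-of-bounds indexing happened because the original code shrank `states` first and then indexed the shrunken list with positions that refer to the original list. A minimal sketch of the pitfall, using hypothetical stand-in data rather than real parser states and ArcEagerGold objects:

    # "states" and "keeps" are stand-ins for the parser states and the
    # indices of non-final states in ArcEager.init_gold_batch().
    states = ["s0", "s1", "s2", "s3"]
    keeps = [1, 3]  # indices into the ORIGINAL list

    # Buggy order: after filtering, len(states) == 2, so index 3 is gone.
    # states = [states[i] for i in keeps]
    # golds = [states[i] for i in keeps]  # IndexError: list index out of range

    # Fixed order (what the patch does): build golds first, filter second.
    golds = [states[i] for i in keeps]
    states = [states[i] for i in keeps]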
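
Note on PATCH 4/6: the rewrite deletes the never-implemented `merge_sentences()` helper and instead groups `n_sents` raw input lines with `spacy.util.minibatch`, so only one Doc is created per group rather than one per line that would have to be merged afterwards. A short sketch of the grouping behaviour (the sample lines are hypothetical):

    from spacy.util import minibatch

    raw_sents = ["I|O saw|O Paris|B-GPE"] * 7  # seven hypothetical IOB lines
    groups = list(minibatch(raw_sents, size=3))
    # Two groups of 3 and one of 1; read_iob() turns each group into a
    # single Doc with sent_starts marking the sentence boundaries.
    assert [len(g) for g in groups] == [3, 3, 1]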
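
Note on PATCH 6/6: `tags_to_entities()` returns (label, start, end) tuples whose end index is inclusive, while `Span` expects an exclusive end, hence the `end=e+1` fix and the new docstring. A minimal sketch, assuming the `spacy.gold` module layout these patches target (the same helpers moved to `spacy.training` in later releases):

    from spacy.gold import iob_to_biluo, tags_to_entities
    from spacy.tokens import Doc, Span
    from spacy.vocab import Vocab

    words = ["I", "like", "New", "York", "City", "."]
    iob = ["O", "O", "B-GPE", "I-GPE", "I-GPE", "O"]

    doc = Doc(Vocab(), words=words)
    biluo = iob_to_biluo(iob)           # ['O', 'O', 'B-GPE', 'I-GPE', 'L-GPE', 'O']
    entities = tags_to_entities(biluo)  # [('GPE', 2, 4)] -- end index 4 is INCLUSIVE
    # Span takes an exclusive end; without e + 1 the entity would cover only "New York".
    doc.ents = [Span(doc, start=s, end=e + 1, label=label) for (label, s, e) in entities]
    assert [ent.text for ent in doc.ents] == ["New York City"]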