diff --git a/spacy/tests/test_visualization.py b/spacy/tests/test_visualization.py index 7ce37e894..7d0da397a 100644 --- a/spacy/tests/test_visualization.py +++ b/spacy/tests/test_visualization.py @@ -1,6 +1,6 @@ import pytest from wasabi.util import supports_ansi -from spacy.visualization import AttributeFormat, Visualizer +from spacy.visualization import AttributeFormat, render_dep_tree, render_table from spacy.tokens import Span, Doc, Token @@ -45,7 +45,7 @@ def test_viz_dep_tree_basic(en_vocab): heads=[2, 2, 3, None, 6, 6, 3, 3, 3], deps=["dep"] * 9, ) - dep_tree = Visualizer.render_dep_tree(doc[0 : len(doc)], True) + dep_tree = render_dep_tree(doc[0 : len(doc)], True) assert dep_tree == [ "<╗ ", "<╣ ", @@ -57,7 +57,7 @@ def test_viz_dep_tree_basic(en_vocab): "<══╣", "<══╝", ] - dep_tree = Visualizer.render_dep_tree(doc[0 : len(doc)], False) + dep_tree = render_dep_tree(doc[0 : len(doc)], False) assert dep_tree == [ " ╔>", " ╠>", @@ -92,7 +92,7 @@ def test_viz_dep_tree_non_initial_sent(en_vocab): heads=[0, None, 0, 5, 5, 6, None, 9, 9, 6, 6, 6], deps=["dep"] * 12, ) - dep_tree = Visualizer.render_dep_tree(doc[3 : len(doc)], True) + dep_tree = render_dep_tree(doc[3 : len(doc)], True) assert dep_tree == [ "<╗ ", "<╣ ", @@ -104,7 +104,7 @@ def test_viz_dep_tree_non_initial_sent(en_vocab): "<══╣", "<══╝", ] - dep_tree = Visualizer.render_dep_tree(doc[3 : len(doc)], False) + dep_tree = render_dep_tree(doc[3 : len(doc)], False) assert dep_tree == [ " ╔>", " ╠>", @@ -120,7 +120,7 @@ def test_viz_dep_tree_non_initial_sent(en_vocab): def test_viz_dep_tree_non_projective(horse_doc): """Test dependency tree display with a non-projective dependency.""" - dep_tree = Visualizer.render_dep_tree(horse_doc[0 : len(horse_doc)], True) + dep_tree = render_dep_tree(horse_doc[0 : len(horse_doc)], True) assert dep_tree == [ "<╗ ", "═╩═══╗", @@ -132,7 +132,7 @@ def test_viz_dep_tree_non_projective(horse_doc): "═╝<╝ ║", "<════╝", ] - dep_tree = Visualizer.render_dep_tree(horse_doc[0 : 
len(horse_doc)], False) + dep_tree = render_dep_tree(horse_doc[0 : len(horse_doc)], False) assert dep_tree == [ " ╔>", "╔═══╩═", @@ -163,7 +163,7 @@ def test_viz_dep_tree_highly_nonprojective(pl_vocab): heads=[5, 5, 0, 5, 5, None, 4, 5], deps=["dep"] * 8, ) - dep_tree = Visualizer.render_dep_tree(doc[0 : len(doc)], True) + dep_tree = render_dep_tree(doc[0 : len(doc)], True) assert dep_tree == [ "═╗<╗", " ║<╣", @@ -174,7 +174,7 @@ def test_viz_dep_tree_highly_nonprojective(pl_vocab): "<╝ ║", "<══╝", ] - dep_tree = Visualizer.render_dep_tree(doc[0 : len(doc)], False) + dep_tree = render_dep_tree(doc[0 : len(doc)], False) assert dep_tree == [ "╔>╔═", "╠>║ ", @@ -190,7 +190,7 @@ def test_viz_dep_tree_highly_nonprojective(pl_vocab): def test_viz_dep_tree_input_not_span(horse_doc): """Test dependency tree display behaviour when the input is not a Span.""" with pytest.raises(ValueError): - Visualizer.render_dep_tree(horse_doc[1:3], True) + render_dep_tree(horse_doc[1:3], True) def test_viz_render_native_attributes(horse_doc): @@ -199,7 +199,10 @@ def test_viz_render_native_attributes(horse_doc): assert AttributeFormat("dep_").render(horse_doc[2]) == "dep" with pytest.raises(AttributeError): AttributeFormat("depp").render(horse_doc[2]) - + with pytest.raises(AttributeError): + AttributeFormat("tree_left").render(horse_doc[2]) + with pytest.raises(AttributeError): + AttributeFormat("tree_right").render(horse_doc[2]) def test_viz_render_colors(horse_doc): assert ( @@ -265,7 +268,7 @@ def test_viz_minimal_render_table_one_sentence( AttributeFormat("ent_type_"), ] assert ( - Visualizer().render(fully_featured_doc_one_sentence, formats, spacing=3).strip() + render_table(fully_featured_doc_one_sentence, formats, spacing=3).strip() == """ ╔>╔═ poss Sarah sarah PROPN NNP NounType=prop|Number=sing PERSON ║ ╚> case 's 's PART POS Poss=yes @@ -295,7 +298,7 @@ def test_viz_minimal_render_table_empty_text( AttributeFormat("morph"), AttributeFormat("ent_type_"), ] - assert 
Visualizer().render(Doc(en_vocab), formats, spacing=3).strip() == "" + assert render_table(Doc(en_vocab), formats, spacing=3).strip() == "" # headers formats = [ @@ -308,7 +311,7 @@ def test_viz_minimal_render_table_empty_text( AttributeFormat("morph"), AttributeFormat("ent_type_", name="ent"), ] - assert Visualizer().render(Doc(en_vocab), formats, spacing=3).strip() == "" + assert render_table(Doc(en_vocab), formats, spacing=3).strip() == "" def test_viz_minimal_render_table_spacing( @@ -325,7 +328,7 @@ def test_viz_minimal_render_table_spacing( AttributeFormat("ent_type_"), ] assert ( - Visualizer().render(fully_featured_doc_one_sentence, formats, spacing=1).strip() + render_table(fully_featured_doc_one_sentence, formats, spacing=1).strip() == """ ╔>╔═ poss Sarah sarah PROPN NNP NounType=prop|Number=sing PERSON ║ ╚> case 's 's PART POS Poss=yes @@ -356,8 +359,7 @@ def test_viz_minimal_render_table_two_sentences( ] assert ( - Visualizer() - .render(fully_featured_doc_two_sentences, formats, spacing=3) + render_table(fully_featured_doc_two_sentences, formats, spacing=3) .strip() == """ ╔>╔═ poss Sarah sarah PROPN NNP NounType=prop|Number=sing PERSON @@ -401,7 +403,7 @@ def test_viz_rich_render_table_one_sentence( ), ] assert ( - Visualizer().render(fully_featured_doc_one_sentence, formats, spacing=3) + render_table(fully_featured_doc_one_sentence, formats, spacing=3) == "\n\x1b[38;5;2m tree\x1b[0m \x1b[38;5;2mdep \x1b[0m index text lemma \x1b[38;5;100mpos \x1b[0m \x1b[38;5;100mtag\x1b[0m \x1b[38;5;100mmorph \x1b[0m \x1b[38;5;196ment \x1b[0m\n\x1b[38;5;2m------\x1b[0m \x1b[38;5;2m--------\x1b[0m ----- ------- ------- \x1b[38;5;100m-----\x1b[0m \x1b[38;5;100m---\x1b[0m \x1b[38;5;100m---------------\x1b[0m \x1b[38;5;196m------\x1b[0m\n\x1b[38;5;2m ╔>╔═\x1b[0m \x1b[38;5;2mposs \x1b[0m 0 Sarah sarah \x1b[38;5;100mPROPN\x1b[0m \x1b[38;5;100mNNP\x1b[0m \x1b[38;5;100mNounType=prop|N\x1b[0m \x1b[38;5;196m\x1b[38;5;50;48;5;12mPERSON\x1b[0m\x1b[0m\n\x1b[38;5;2m ║ ╚>\x1b[0m 
\x1b[38;5;2mcase \x1b[0m 1 's 's \x1b[38;5;100mPART \x1b[0m \x1b[38;5;100mPOS\x1b[0m \x1b[38;5;100mPoss=yes \x1b[0m \x1b[38;5;196m \x1b[0m\n\x1b[38;5;2m╔>╚═══\x1b[0m \x1b[38;5;2mnsubj \x1b[0m 2 sister sister \x1b[38;5;100mNOUN \x1b[0m \x1b[38;5;100mNN \x1b[0m \x1b[38;5;100mNumber=sing \x1b[0m \x1b[38;5;196m \x1b[0m\n\x1b[38;5;2m╠═════\x1b[0m \x1b[38;5;2mROOT \x1b[0m 3 flew fly \x1b[38;5;100mVERB \x1b[0m \x1b[38;5;100mVBD\x1b[0m \x1b[38;5;100mTense=past|Verb\x1b[0m \x1b[38;5;196m \x1b[0m\n\x1b[38;5;2m╠>╔═══\x1b[0m \x1b[38;5;2mprep \x1b[0m 4 to to \x1b[38;5;100mADP \x1b[0m \x1b[38;5;100mIN \x1b[0m \x1b[38;5;100m \x1b[0m \x1b[38;5;196m \x1b[0m\n\x1b[38;5;2m║ ║ ╔>\x1b[0m \x1b[38;5;2mcompound\x1b[0m 5 Silicon silicon \x1b[38;5;100mPROPN\x1b[0m \x1b[38;5;100mNNP\x1b[0m \x1b[38;5;100mNounType=prop|N\x1b[0m \x1b[38;5;196mGPE \x1b[0m\n\x1b[38;5;2m║ ╚>╚═\x1b[0m \x1b[38;5;2mpobj \x1b[0m 6 Valley valley \x1b[38;5;100mPROPN\x1b[0m \x1b[38;5;100mNNP\x1b[0m \x1b[38;5;100mNounType=prop|N\x1b[0m \x1b[38;5;196mGPE \x1b[0m\n\x1b[38;5;2m╠══>╔═\x1b[0m \x1b[38;5;2mprep \x1b[0m 7 via via \x1b[38;5;100mADP \x1b[0m \x1b[38;5;100mIN \x1b[0m \x1b[38;5;100m \x1b[0m \x1b[38;5;196m \x1b[0m\n\x1b[38;5;2m║ ╚>\x1b[0m \x1b[38;5;2mpobj \x1b[0m 8 London london \x1b[38;5;100mPROPN\x1b[0m \x1b[38;5;100mNNP\x1b[0m \x1b[38;5;100mNounType=prop|N\x1b[0m \x1b[38;5;196mGPE \x1b[0m\n\x1b[38;5;2m╚════>\x1b[0m \x1b[38;5;2mpunct \x1b[0m 9 . . \x1b[38;5;100mPUNCT\x1b[0m \x1b[38;5;100m. 
\x1b[0m \x1b[38;5;100mPunctType=peri \x1b[0m \x1b[38;5;196m \x1b[0m\n\n" if SUPPORTS_ANSI else "\n\x1b[38;5;2m tree\x1b[0m \x1b[38;5;2mdep \x1b[0m index text lemma pos tag morph ent \n\x1b[38;5;2m------\x1b[0m \x1b[38;5;2m--------\x1b[0m ----- ------- ------- ----- --- --------------- ------\n\x1b[38;5;2m ╔>╔═\x1b[0m \x1b[38;5;2mposs \x1b[0m 0 Sarah sarah PROPN NNP NounType=prop|N PERSON\n\x1b[38;5;2m ║ ╚>\x1b[0m \x1b[38;5;2mcase \x1b[0m 1 's 's PART POS Poss=yes \n\x1b[38;5;2m╔>╚═══\x1b[0m \x1b[38;5;2mnsubj \x1b[0m 2 sister sister NOUN NN Number=sing \n\x1b[38;5;2m╠═════\x1b[0m \x1b[38;5;2mROOT \x1b[0m 3 flew fly VERB VBD Tense=past|Verb \n\x1b[38;5;2m╠>╔═══\x1b[0m \x1b[38;5;2mprep \x1b[0m 4 to to ADP IN \n\x1b[38;5;2m║ ║ ╔>\x1b[0m \x1b[38;5;2mcompound\x1b[0m 5 Silicon silicon PROPN NNP NounType=prop|N GPE \n\x1b[38;5;2m║ ╚>╚═\x1b[0m \x1b[38;5;2mpobj \x1b[0m 6 Valley valley PROPN NNP NounType=prop|N GPE \n\x1b[38;5;2m╠══>╔═\x1b[0m \x1b[38;5;2mprep \x1b[0m 7 via via ADP IN \n\x1b[38;5;2m║ ╚>\x1b[0m \x1b[38;5;2mpobj \x1b[0m 8 London london PROPN NNP NounType=prop|N GPE \n\x1b[38;5;2m╚════>\x1b[0m \x1b[38;5;2mpunct \x1b[0m 9 . . PUNCT . 
PunctType=peri \n\n" @@ -429,7 +431,7 @@ def test_viz_rich_render_table_one_sentence( ), ] assert ( - Visualizer().render(fully_featured_doc_one_sentence, formats, spacing=3) + render_table(fully_featured_doc_one_sentence, formats, spacing=3) == "\n\x1b[38;5;2m tree\x1b[0m \x1b[38;5;2mdep \x1b[0m index \x1b[38;5;196mtext \x1b[0m lemma \x1b[38;5;100mpos \x1b[0m \x1b[38;5;100mtag\x1b[0m \x1b[38;5;100mmorph \x1b[0m ent \n\x1b[38;5;2m------\x1b[0m \x1b[38;5;2m--------\x1b[0m ----- \x1b[38;5;196m-------\x1b[0m ------- \x1b[38;5;100m-----\x1b[0m \x1b[38;5;100m---\x1b[0m \x1b[38;5;100m---------------\x1b[0m ------\n\x1b[38;5;2m ╔>╔═\x1b[0m \x1b[38;5;2mposs \x1b[0m 0 \x1b[38;5;196mSarah \x1b[0m sarah \x1b[38;5;100mPROPN\x1b[0m \x1b[38;5;100mNNP\x1b[0m \x1b[38;5;100mNounType=prop|N\x1b[0m PERSON\n\x1b[38;5;2m ║ ╚>\x1b[0m \x1b[38;5;2mcase \x1b[0m 1 \x1b[38;5;196m\x1b[38;5;50;48;5;12m's\x1b[0m \x1b[0m 's \x1b[38;5;100mPART \x1b[0m \x1b[38;5;100mPOS\x1b[0m \x1b[38;5;100mPoss=yes \x1b[0m \n\x1b[38;5;2m╔>╚═══\x1b[0m \x1b[38;5;2mnsubj \x1b[0m 2 \x1b[38;5;196msister \x1b[0m sister \x1b[38;5;100mNOUN \x1b[0m \x1b[38;5;100mNN \x1b[0m \x1b[38;5;100mNumber=sing \x1b[0m \n\x1b[38;5;2m╠═════\x1b[0m \x1b[38;5;2mROOT \x1b[0m 3 \x1b[38;5;196mflew \x1b[0m fly \x1b[38;5;100mVERB \x1b[0m \x1b[38;5;100mVBD\x1b[0m \x1b[38;5;100mTense=past|Verb\x1b[0m \n\x1b[38;5;2m╠>╔═══\x1b[0m \x1b[38;5;2mprep \x1b[0m 4 \x1b[38;5;196mto \x1b[0m to \x1b[38;5;100mADP \x1b[0m \x1b[38;5;100mIN \x1b[0m \x1b[38;5;100m \x1b[0m \n\x1b[38;5;2m║ ║ ╔>\x1b[0m \x1b[38;5;2mcompound\x1b[0m 5 \x1b[38;5;196mSilicon\x1b[0m silicon \x1b[38;5;100mPROPN\x1b[0m \x1b[38;5;100mNNP\x1b[0m \x1b[38;5;100mNounType=prop|N\x1b[0m GPE \n\x1b[38;5;2m║ ╚>╚═\x1b[0m \x1b[38;5;2mpobj \x1b[0m 6 \x1b[38;5;196mValley \x1b[0m valley \x1b[38;5;100mPROPN\x1b[0m \x1b[38;5;100mNNP\x1b[0m \x1b[38;5;100mNounType=prop|N\x1b[0m GPE \n\x1b[38;5;2m╠══>╔═\x1b[0m \x1b[38;5;2mprep \x1b[0m 7 \x1b[38;5;196mvia \x1b[0m via \x1b[38;5;100mADP \x1b[0m \x1b[38;5;100mIN 
\x1b[0m \x1b[38;5;100m \x1b[0m \n\x1b[38;5;2m║ ╚>\x1b[0m \x1b[38;5;2mpobj \x1b[0m 8 \x1b[38;5;196mLondon \x1b[0m london \x1b[38;5;100mPROPN\x1b[0m \x1b[38;5;100mNNP\x1b[0m \x1b[38;5;100mNounType=prop|N\x1b[0m GPE \n\x1b[38;5;2m╚════>\x1b[0m \x1b[38;5;2mpunct \x1b[0m 9 \x1b[38;5;196m. \x1b[0m . \x1b[38;5;100mPUNCT\x1b[0m \x1b[38;5;100m. \x1b[0m \x1b[38;5;100mPunctType=peri \x1b[0m \n\n" if SUPPORTS_ANSI else "\n\x1b[38;5;2m tree\x1b[0m \x1b[38;5;2mdep \x1b[0m index text lemma pos tag \x1b[38;5;100mmorph \x1b[0m ent \n\x1b[38;5;2m------\x1b[0m \x1b[38;5;2m--------\x1b[0m ----- ------- ------- ----- --- \x1b[38;5;100m-------------------------\x1b[0m ------\n\x1b[38;5;2m ╔>╔═\x1b[0m \x1b[38;5;2mposs \x1b[0m 0 Sarah sarah PROPN NNP \x1b[38;5;100mNounType=prop|Number=sing\x1b[0m PERSON\n\x1b[38;5;2m ║ ╚>\x1b[0m \x1b[38;5;2mcase \x1b[0m 1 's 's PART POS \x1b[38;5;100mPoss=yes \x1b[0m \n\x1b[38;5;2m╔>╚═══\x1b[0m \x1b[38;5;2mnsubj \x1b[0m 2 sister sister NOUN NN \x1b[38;5;100mNumber=sing \x1b[0m \n\x1b[38;5;2m╠═════\x1b[0m \x1b[38;5;2mROOT \x1b[0m 3 flew fly VERB VBD \x1b[38;5;100mTense=past|VerbForm=fin \x1b[0m \n\x1b[38;5;2m╠>╔═══\x1b[0m \x1b[38;5;2mprep \x1b[0m 4 to to ADP IN \x1b[38;5;100m \x1b[0m \n\x1b[38;5;2m║ ║ ╔>\x1b[0m \x1b[38;5;2mcompound\x1b[0m 5 Silicon silicon PROPN NNP \x1b[38;5;100mNounType=prop|Number=sing\x1b[0m GPE \n\x1b[38;5;2m║ ╚>╚═\x1b[0m \x1b[38;5;2mpobj \x1b[0m 6 Valley valley PROPN NNP \x1b[38;5;100mNounType=prop|Number=sing\x1b[0m GPE \n\x1b[38;5;2m╠══>╔═\x1b[0m \x1b[38;5;2mprep \x1b[0m 7 via via ADP IN \x1b[38;5;100m \x1b[0m \n\x1b[38;5;2m║ ╚>\x1b[0m \x1b[38;5;2mpobj \x1b[0m 8 London london PROPN NNP \x1b[38;5;100mNounType=prop|Number=sing\x1b[0m GPE \n\x1b[38;5;2m╚════>\x1b[0m \x1b[38;5;2mpunct \x1b[0m 9 . . PUNCT . 
\x1b[38;5;100mPunctType=peri \x1b[0m \n\n" @@ -456,9 +458,9 @@ def test_viz_rich_render_table_two_sentences( value_dep_bg_colors={"PERSON": 12}, ), ] - print(Visualizer().render(fully_featured_doc_two_sentences, formats, spacing=3)) + print(render_table(fully_featured_doc_two_sentences, formats, spacing=3)) print( - repr(Visualizer().render(fully_featured_doc_two_sentences, formats, spacing=3)) + repr(render_table(fully_featured_doc_two_sentences, formats, spacing=3)) ) target = ( "\n\x1b[38;5;2m tree\x1b[0m \x1b[38;5;2mdep \x1b[0m index text lemma \x1b[38;5;100mpos \x1b[0m \x1b[38;5;100mtag\x1b[0m \x1b[38;5;100mmorph \x1b[0m \x1b[38;5;196ment \x1b[0m\n\x1b[38;5;2m------\x1b[0m \x1b[38;5;2m--------\x1b[0m ----- ------- ------- \x1b[38;5;100m-----\x1b[0m \x1b[38;5;100m---\x1b[0m \x1b[38;5;100m---------------\x1b[0m \x1b[38;5;196m------\x1b[0m\n\x1b[38;5;2m ╔>╔═\x1b[0m \x1b[38;5;2mposs \x1b[0m 0 Sarah sarah \x1b[38;5;100mPROPN\x1b[0m \x1b[38;5;100mNNP\x1b[0m \x1b[38;5;100mNounType=prop|N\x1b[0m \x1b[38;5;196m\x1b[38;5;50;48;5;12mPERSON\x1b[0m\x1b[0m\n\x1b[38;5;2m ║ ╚>\x1b[0m \x1b[38;5;2mcase \x1b[0m 1 's 's \x1b[38;5;100mPART \x1b[0m \x1b[38;5;100mPOS\x1b[0m \x1b[38;5;100mPoss=yes \x1b[0m \x1b[38;5;196m \x1b[0m\n\x1b[38;5;2m╔>╚═══\x1b[0m \x1b[38;5;2mnsubj \x1b[0m 2 sister sister \x1b[38;5;100mNOUN \x1b[0m \x1b[38;5;100mNN \x1b[0m \x1b[38;5;100mNumber=sing \x1b[0m \x1b[38;5;196m \x1b[0m\n\x1b[38;5;2m╠═════\x1b[0m \x1b[38;5;2mROOT \x1b[0m 3 flew fly \x1b[38;5;100mVERB \x1b[0m \x1b[38;5;100mVBD\x1b[0m \x1b[38;5;100mTense=past|Verb\x1b[0m \x1b[38;5;196m \x1b[0m\n\x1b[38;5;2m╠>╔═══\x1b[0m \x1b[38;5;2mprep \x1b[0m 4 to to \x1b[38;5;100mADP \x1b[0m \x1b[38;5;100mIN \x1b[0m \x1b[38;5;100m \x1b[0m \x1b[38;5;196m \x1b[0m\n\x1b[38;5;2m║ ║ ╔>\x1b[0m \x1b[38;5;2mcompound\x1b[0m 5 Silicon silicon \x1b[38;5;100mPROPN\x1b[0m \x1b[38;5;100mNNP\x1b[0m \x1b[38;5;100mNounType=prop|N\x1b[0m \x1b[38;5;196mGPE \x1b[0m\n\x1b[38;5;2m║ ╚>╚═\x1b[0m \x1b[38;5;2mpobj \x1b[0m 6 Valley valley 
\x1b[38;5;100mPROPN\x1b[0m \x1b[38;5;100mNNP\x1b[0m \x1b[38;5;100mNounType=prop|N\x1b[0m \x1b[38;5;196mGPE \x1b[0m\n\x1b[38;5;2m╠══>╔═\x1b[0m \x1b[38;5;2mprep \x1b[0m 7 via via \x1b[38;5;100mADP \x1b[0m \x1b[38;5;100mIN \x1b[0m \x1b[38;5;100m \x1b[0m \x1b[38;5;196m \x1b[0m\n\x1b[38;5;2m║ ╚>\x1b[0m \x1b[38;5;2mpobj \x1b[0m 8 London london \x1b[38;5;100mPROPN\x1b[0m \x1b[38;5;100mNNP\x1b[0m \x1b[38;5;100mNounType=prop|N\x1b[0m \x1b[38;5;196mGPE \x1b[0m\n\x1b[38;5;2m╚════>\x1b[0m \x1b[38;5;2mpunct \x1b[0m 9 . . \x1b[38;5;100mPUNCT\x1b[0m \x1b[38;5;100m. \x1b[0m \x1b[38;5;100mPunctType=peri \x1b[0m \x1b[38;5;196m \x1b[0m\n\n\n\x1b[38;5;2mtree\x1b[0m \x1b[38;5;2mdep \x1b[0m index text lemma \x1b[38;5;100mpos \x1b[0m \x1b[38;5;100mtag\x1b[0m \x1b[38;5;100mmorph \x1b[0m \x1b[38;5;196ment\x1b[0m\n\x1b[38;5;2m----\x1b[0m \x1b[38;5;2m-----\x1b[0m ----- ----- ----- \x1b[38;5;100m-----\x1b[0m \x1b[38;5;100m---\x1b[0m \x1b[38;5;100m---------------\x1b[0m \x1b[38;5;196m---\x1b[0m\n\x1b[38;5;2m ╔>\x1b[0m \x1b[38;5;2mnsubj\x1b[0m 10 She she \x1b[38;5;100mPRON \x1b[0m \x1b[38;5;100mPRP\x1b[0m \x1b[38;5;100mCase=Nom|Gender\x1b[0m \x1b[38;5;196m \x1b[0m\n\x1b[38;5;2m ╠═\x1b[0m \x1b[38;5;2mROOT \x1b[0m 11 loved love \x1b[38;5;100mVERB \x1b[0m \x1b[38;5;100mVBD\x1b[0m \x1b[38;5;100mTense=Past|Verb\x1b[0m \x1b[38;5;196m \x1b[0m\n\x1b[38;5;2m ╠>\x1b[0m \x1b[38;5;2mdobj \x1b[0m 12 it it \x1b[38;5;100mPRON \x1b[0m \x1b[38;5;100mPRP\x1b[0m \x1b[38;5;100mCase=Acc|Gender\x1b[0m \x1b[38;5;196m \x1b[0m\n\x1b[38;5;2m ╚>\x1b[0m \x1b[38;5;2mpunct\x1b[0m 13 . . \x1b[38;5;100mPUNCT\x1b[0m \x1b[38;5;100m. 
\x1b[0m \x1b[38;5;100mPunctType=peri \x1b[0m \x1b[38;5;196m \x1b[0m\n\n" @@ -466,17 +468,17 @@ def test_viz_rich_render_table_two_sentences( else "\n tree dep index text lemma pos tag morph ent \n------ -------- ----- ------- ------- ----- --- --------------- ------\n ╔>╔═ poss 0 Sarah sarah PROPN NNP NounType=prop|N PERSON\n ║ ╚> case 1 's 's PART POS Poss=yes \n╔>╚═══ nsubj 2 sister sister NOUN NN Number=sing \n╠═════ ROOT 3 flew fly VERB VBD Tense=past|Verb \n╠>╔═══ prep 4 to to ADP IN \n║ ║ ╔> compound 5 Silicon silicon PROPN NNP NounType=prop|N GPE \n║ ╚>╚═ pobj 6 Valley valley PROPN NNP NounType=prop|N GPE \n╠══>╔═ prep 7 via via ADP IN \n║ ╚> pobj 8 London london PROPN NNP NounType=prop|N GPE \n╚════> punct 9 . . PUNCT . PunctType=peri \n\n\ntree dep index text lemma pos tag morph ent\n---- ----- ----- ----- ----- ----- --- --------------- ---\n ╔> nsubj 10 She she PRON PRP Case=Nom|Gender \n ╠═ ROOT 11 loved love VERB VBD Tense=Past|Verb \n ╠> dobj 12 it it PRON PRP Case=Acc|Gender \n ╚> punct 13 . . PUNCT . 
PunctType=peri \n\n" ) assert ( - Visualizer().render(fully_featured_doc_two_sentences, formats, spacing=3) + render_table(fully_featured_doc_two_sentences, formats, spacing=3) == target ) assert ( - Visualizer().render( + render_table( fully_featured_doc_two_sentences, formats, spacing=3, start_i=3, length=300 ) == target ) assert ( - Visualizer().render( + render_table( fully_featured_doc_two_sentences, formats, spacing=3, start_i=3, length=9 ) == target @@ -504,13 +506,13 @@ def test_viz_rich_render_table_start( ), ] print( - Visualizer().render( + render_table( fully_featured_doc_two_sentences, formats, spacing=3, start_i=11 ) ) print( repr( - Visualizer().render( + render_table( fully_featured_doc_two_sentences, formats, spacing=3, start_i=11 ) ) @@ -521,13 +523,13 @@ def test_viz_rich_render_table_start( else "\ntree dep index text lemma pos tag morph ent\n---- ----- ----- ----- ----- ----- --- --------------- ---\n ╔> nsubj 10 She she PRON PRP Case=Nom|Gender \n ╠═ ROOT 11 loved love VERB VBD Tense=Past|Verb \n ╠> dobj 12 it it PRON PRP Case=Acc|Gender \n ╚> punct 13 . . PUNCT . 
PunctType=peri \n\n" ) assert ( - Visualizer().render( + render_table( fully_featured_doc_two_sentences, formats, spacing=3, start_i=11 ) == target ) assert ( - Visualizer().render( + render_table( fully_featured_doc_two_sentences, formats, spacing=3, @@ -538,7 +540,7 @@ def test_viz_rich_render_table_start( == target ) assert ( - Visualizer().render( + render_table( fully_featured_doc_two_sentences, formats, spacing=3, @@ -549,7 +551,7 @@ def test_viz_rich_render_table_start( == target ) assert ( - Visualizer().render( + render_table( fully_featured_doc_two_sentences, formats, spacing=3, @@ -559,7 +561,7 @@ def test_viz_rich_render_table_start( == target ) assert ( - Visualizer().render( + render_table( fully_featured_doc_two_sentences, formats, spacing=3, @@ -571,7 +573,7 @@ def test_viz_rich_render_table_start( == target ) assert ( - Visualizer().render( + render_table( fully_featured_doc_two_sentences, formats, spacing=3, @@ -581,7 +583,7 @@ def test_viz_rich_render_table_start( == target ) assert ( - Visualizer().render( + render_table( fully_featured_doc_two_sentences, formats, spacing=3, @@ -591,7 +593,7 @@ def test_viz_rich_render_table_start( == "" ) assert ( - Visualizer().render( + render_table( fully_featured_doc_two_sentences, formats, spacing=3, @@ -601,7 +603,7 @@ def test_viz_rich_render_table_start( == "" ) assert ( - Visualizer().render( + render_table( fully_featured_doc_two_sentences, formats, spacing=3, @@ -611,7 +613,7 @@ def test_viz_rich_render_table_start( == "" ) assert ( - Visualizer().render( + render_table( fully_featured_doc_two_sentences, formats, spacing=3, @@ -650,25 +652,25 @@ def test_viz_rich_render_table_end( ) assert ( - Visualizer().render( + render_table( fully_featured_doc_two_sentences, formats, spacing=3, start_i=2 ) == target ) assert ( - Visualizer().render( + render_table( fully_featured_doc_two_sentences, formats, spacing=3, start_i=2, length=3 ) == target ) assert ( - Visualizer().render( + render_table( 
fully_featured_doc_two_sentences, formats, spacing=3, length=3 ) == target ) assert ( - Visualizer().render( + render_table( fully_featured_doc_two_sentences, formats, spacing=3, diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 075bc4d15..6ef8c5617 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -38,6 +38,7 @@ from .underscore import Underscore, get_ext_args from ._retokenize import Retokenizer from ._serialize import ALL_ATTRS as DOCBIN_ALL_ATTRS from ..util import get_words_and_spaces +from ..visualization import render_document DEF PADDING = 5 @@ -1751,6 +1752,45 @@ cdef class Doc: attrs.extend(intify_attr(x) for x in DOCBIN_ALL_ATTRS) return tuple(attrs) + def inspect( + self, + search_attr_name=None, + search_attr_value=None, + *, + start_i=0, + length=None + ): + """Prints a tabular representation of the document or part of the document. + If part of the document is specified using any of the four optional + parameters, the sentences surrounding that part of the document are rendered; + if none of the four optional parameters are specified, the whole document is + rendered. + + search_attr_name: the name of an attribute to search for in order to + determine where to start rendering, e.g. "lemma_", + or *None* if no search is to be carried out. If either + of *search_attr_name* and *search_attr_value* is *None*, + the behaviour is as if both were *None*. + search_attr_value: the value of an attribute to search for in order to + determine where to start rendering, e.g. "be", + or *None* if no search is to be carried out. If either + of *search_attr_name* and *search_attr_value* is *None*, + the behaviour is as if both were *None*. + start_i: the token index at which to start searching, or at + whose sentence to start rendering. Default: 0. + length: the number of tokens after *start_i* at whose sentence + to stop rendering. If *None*, the rest of the + document is rendered. 
+ """ + print( + render_document( + self, + search_attr_name, + search_attr_value, + start_i=start_i, + length=length + ) + ) cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2: cdef int i = token_by_char(tokens, length, start_char) diff --git a/spacy/visualization.py b/spacy/visualization.py index b00191c6e..fcb066f76 100644 --- a/spacy/visualization.py +++ b/spacy/visualization.py @@ -4,7 +4,6 @@ from re import search from typing import Dict, List, Optional, Union, cast import wasabi from wasabi.util import supports_ansi -from spacy.tokens import Span, Token, Doc SUPPORTS_ANSI = supports_ansi() @@ -57,7 +56,6 @@ class AttributeFormat: max_width: Optional[int] = None, fg_color: Optional[Union[str, int]] = None, bg_color: Optional[Union[str, int]] = None, - permitted_vals: Optional[tuple] = None, value_dep_fg_colors: Optional[Dict[str, Union[str, int]]] = None, value_dep_bg_colors: Optional[Dict[str, Union[str, int]]] = None, ): @@ -78,13 +76,17 @@ class AttributeFormat: self.max_width = max_width self.fg_color = fg_color self.bg_color = bg_color - self.value_dep_fg_colors = value_dep_fg_colors - self.value_dep_bg_colors = value_dep_bg_colors + self.value_dep_fg_colors = ( + value_dep_fg_colors if value_dep_fg_colors is not None else {} + ) + self.value_dep_bg_colors = ( + value_dep_bg_colors if value_dep_bg_colors is not None else {} + ) self.printer = wasabi.Printer(no_print=True) def render( self, - token: Token, + token, *, right_pad_to_len: Optional[int] = None, ignore_colors: bool = False, @@ -93,7 +95,7 @@ class AttributeFormat: right_pad_to_len: the width to which values should be right-padded, or 'None' for no right-padding. 
ignore_colors: no colors should be rendered, typically because the values are required to calculate widths """ - value = get_token_value(token, self.attribute) + value = _get_token_value(token, self.attribute) if self.max_width is not None: value = value[: self.max_width] fg_color = None @@ -103,315 +105,363 @@ class AttributeFormat: else: right_padding = "" if SUPPORTS_ANSI and not ignore_colors and len(value) > 0: - if self.value_dep_fg_colors is not None: + if len(self.value_dep_fg_colors) > 0: fg_color = self.value_dep_fg_colors.get(value, None) - if self.value_dep_bg_colors is not None: + if len(self.value_dep_bg_colors) > 0: bg_color = self.value_dep_bg_colors.get(value, None) if fg_color is not None or bg_color is not None: value = self.printer.text(value, color=fg_color, bg_color=bg_color) return value + right_padding -class Visualizer: - @staticmethod - def render_dep_tree(sent: Span, root_right: bool) -> List[str]: - """ - Returns an ASCII rendering of the document with a dependency tree for each sentence. The - dependency tree output for a given token has the same index within the output list of - strings as that token within the input document. +def render_dep_tree(sent, root_right: bool) -> List[str]: + """ + Returns an ASCII rendering of the document with a dependency tree for each sentence. The + dependency tree output for a given token has the same index within the output list of + strings as that token within the input document. - root_right: True if the tree should be rendered with the root on the right-hand side, - False if the tree should be rendered with the root on the left-hand side. + root_right: True if the tree should be rendered with the root on the right-hand side, + False if the tree should be rendered with the root on the left-hand side. 
- Algorithm adapted from https://github.com/KoichiYasuoka/deplacy - """ + Algorithm adapted from https://github.com/KoichiYasuoka/deplacy + """ - # Check sent is really a sentence - if sent.start != sent[0].sent.start or sent.end != sent[0].sent.end: - raise ValueError(f"Span is not a sentence: '{sent}'") - heads: List[Optional[int]] = [] - for token in sent: - if token.dep_.lower() == "root" or token.head.i == token.i: - heads.append(None) - else: - heads.append(token.head.i - sent.start) - # Check there are no head references outside the sentence - heads_outside_sent = [ - 1 for h in heads if h is not None and (h < 0 or h > sent.end - sent.start) - ] - if len(heads_outside_sent) > 0: - raise ValueError(f"Head reference outside sentence in sentence '{sent}'") - children_lists: List[List[int]] = [[] for _ in range(sent.end - sent.start)] - for child, head in enumerate(heads): - if head is not None: - children_lists[head].append(child) - all_ind_ord_by_col: List[int] = [] - # start with the root column - inds_in_this_col = [i for i, h in enumerate(heads) if h is None] - while len(inds_in_this_col) > 0: - all_ind_ord_by_col = inds_in_this_col + all_ind_ord_by_col - inds_in_next_col = [] - # The calculation order of the horizontal lengths of the children - # on either given side of a head must ensure that children - # closer to the head are processed first. 
- for ind_in_this_col in inds_in_this_col: - following_child_inds = [ - i for i in children_lists[ind_in_this_col] if i > ind_in_this_col - ] - inds_in_next_col.extend(following_child_inds) - preceding_child_inds = [ - i for i in children_lists[ind_in_this_col] if i < ind_in_this_col - ] - preceding_child_inds.reverse() - inds_in_next_col.extend(preceding_child_inds) - inds_in_this_col = inds_in_next_col - horiz_line_lens: List[int] = [] - for i in range(sent.end - sent.start): - if heads[i] is None: - horiz_line_lens.append(-1) - elif len(children_lists[i]) == 0 and abs(cast(int, heads[i]) - i) == 1: - # governed by direct neighbour and has no children itself - horiz_line_lens.append(1) - else: - horiz_line_lens.append(0) - while 0 in horiz_line_lens: - for working_token_ind in ( - i for i in all_ind_ord_by_col if horiz_line_lens[i] == 0 - ): - # render relation between this token and its head - first_ind_in_rel = min( - working_token_ind, - cast(int, heads[working_token_ind]), - ) - second_ind_in_rel = max( - working_token_ind, - cast(int, heads[working_token_ind]), - ) - # If this token has children, they will already have been rendered. - # The line needs to be one character longer than the longest of the - # children's lines. 
- if len(children_lists[working_token_ind]) > 0: - horiz_line_lens[working_token_ind] = ( - max( - [ - horiz_line_lens[i] - for i in children_lists[working_token_ind] - ] - ) - + 1 - ) - else: - horiz_line_lens[working_token_ind] = 1 - for inbetween_ind in ( - i - for i in range(first_ind_in_rel + 1, second_ind_in_rel) - if horiz_line_lens[i] != 0 - ): - alt_ind: int - if ( - inbetween_ind - in children_lists[cast(int, heads[working_token_ind])] - and inbetween_ind not in children_lists[working_token_ind] - ): - alt_ind = horiz_line_lens[inbetween_ind] - else: - alt_ind = horiz_line_lens[inbetween_ind] + 1 - if alt_ind > horiz_line_lens[working_token_ind]: - horiz_line_lens[working_token_ind] = alt_ind - max_horiz_line_len = max(horiz_line_lens) - char_matrix = [ - [SPACE] * max_horiz_line_len * 2 for _ in range(sent.start, sent.end) - ] - for working_token_ind in range(sent.end - sent.start): - head_token_ind = heads[working_token_ind] - if head_token_ind is None: - continue - first_ind_in_rel = min(working_token_ind, head_token_ind) - second_ind_in_rel = max(working_token_ind, head_token_ind) - char_horiz_line_len = 2 * horiz_line_lens[working_token_ind] - - # Draw the corners of the relation - char_matrix[first_ind_in_rel][char_horiz_line_len - 1] |= ( - HALF_HORIZONTAL_LINE + LOWER_HALF_VERTICAL_LINE - ) - char_matrix[second_ind_in_rel][char_horiz_line_len - 1] |= ( - HALF_HORIZONTAL_LINE + UPPER_HALF_VERTICAL_LINE - ) - - # Draw the horizontal line for the governing token - for working_horiz_pos in range(char_horiz_line_len - 1): - if char_matrix[head_token_ind][working_horiz_pos] != FULL_VERTICAL_LINE: - char_matrix[head_token_ind][ - working_horiz_pos - ] |= FULL_HORIZONTAL_LINE - - # Draw the vertical line for the relation - for working_vert_pos in range(first_ind_in_rel + 1, second_ind_in_rel): - if ( - char_matrix[working_vert_pos][char_horiz_line_len - 1] - != FULL_HORIZONTAL_LINE - ): - char_matrix[working_vert_pos][ - char_horiz_line_len - 1 - ] |= 
FULL_VERTICAL_LINE + # Check sent is really a sentence + if sent.start != sent[0].sent.start or sent.end != sent[0].sent.end: + raise ValueError(f"Span is not a sentence: '{sent}'") + heads: List[Optional[int]] = [] + for token in sent: + if token.dep_.lower() == "root" or token.head.i == token.i: + heads.append(None) + else: + heads.append(token.head.i - sent.start) + # Check there are no head references outside the sentence + heads_outside_sent = [ + 1 for h in heads if h is not None and (h < 0 or h > sent.end - sent.start) + ] + if len(heads_outside_sent) > 0: + raise ValueError(f"Head reference outside sentence in sentence '{sent}'") + children_lists: List[List[int]] = [[] for _ in range(sent.end - sent.start)] + for child, head in enumerate(heads): + if head is not None: + children_lists[head].append(child) + all_ind_ord_by_col: List[int] = [] + # start with the root column + inds_in_this_col = [i for i, h in enumerate(heads) if h is None] + while len(inds_in_this_col) > 0: + all_ind_ord_by_col = inds_in_this_col + all_ind_ord_by_col + inds_in_next_col = [] + # The calculation order of the horizontal lengths of the children + # on either given side of a head must ensure that children + # closer to the head are processed first. 
+ for ind_in_this_col in inds_in_this_col: + following_child_inds = [ + i for i in children_lists[ind_in_this_col] if i > ind_in_this_col + ] + inds_in_next_col.extend(following_child_inds) + preceding_child_inds = [ + i for i in children_lists[ind_in_this_col] if i < ind_in_this_col + ] + preceding_child_inds.reverse() + inds_in_next_col.extend(preceding_child_inds) + inds_in_this_col = inds_in_next_col + horiz_line_lens: List[int] = [] + for i in range(sent.end - sent.start): + if heads[i] is None: + horiz_line_lens.append(-1) + elif len(children_lists[i]) == 0 and abs(cast(int, heads[i]) - i) == 1: + # governed by direct neighbour and has no children itself + horiz_line_lens.append(1) + else: + horiz_line_lens.append(0) + while 0 in horiz_line_lens: for working_token_ind in ( - i for i in range(sent.end - sent.start) if heads[i] is not None + i for i in all_ind_ord_by_col if horiz_line_lens[i] == 0 ): - for working_horiz_pos in range( - 2 * horiz_line_lens[working_token_ind] - 2, -1, -1 - ): - if ( - ( - char_matrix[working_token_ind][working_horiz_pos] - == FULL_VERTICAL_LINE - ) - and working_horiz_pos > 1 - and char_matrix[working_token_ind][working_horiz_pos - 2] == SPACE - ): - # Cross over the existing vertical line, which is owing to a non-projective tree - continue - if char_matrix[working_token_ind][working_horiz_pos] != SPACE: - # Draw the arrowhead to the right of what is already there - char_matrix[working_token_ind][working_horiz_pos + 1] = ARROWHEAD - break - if working_horiz_pos == 0: - # Draw the arrowhead at the boundary of the diagram - char_matrix[working_token_ind][working_horiz_pos] = ARROWHEAD - else: - # Fill in the horizontal line for the governed token - char_matrix[working_token_ind][ - working_horiz_pos - ] |= FULL_HORIZONTAL_LINE - if root_right: - return [ - "".join( - ROOT_RIGHT_CHARS[char_matrix[vert_pos][horiz_pos]] - for horiz_pos in range((max_horiz_line_len * 2)) - ) - for vert_pos in range(sent.end - sent.start) - ] - else: - 
return [ - "".join( - ROOT_LEFT_CHARS[char_matrix[vert_pos][horiz_pos]] - for horiz_pos in range((max_horiz_line_len * 2)) - )[::-1] - for vert_pos in range(sent.end - sent.start) - ] - - def render( - self, - doc: Doc, - cols: List[AttributeFormat], - spacing: int = 2, - start_i: int = 0, - length: Optional[int] = None, - search_attr_name: Optional[str] = None, - search_attr_value: Optional[str] = None, - ) -> str: - """Renders a document as a table. - TODO: specify a specific portion of the document to display. - - cols: the attribute formats of the columns to display. - tree_right and tree_left are magic values for the - attributes that render dependency trees where the - roots are on the left or right respectively. - spacing: the number of spaces between each column in the table. - start_i: the token index at which to start searching, or at - whose sentence to start rendering. Default: 0. - length: the number of tokens after *start_i* at whose sentence - to stop rendering. If *None*, the rest of the - document is rendered. - search_attr_name: the name of an attribute to search for in order to - determine where to start rendering, e.g. "lemma_", - or *None* if no search is to be carried out. If either - of *search_attr_name* and *search_attr_value* is *None*, - the behaviour is as if both were *None*. - search_attr_value: the value of an attribute to search for in order to - determine where to start rendering, e.g. "be", - or *None* if no search is to be carried out. If either - of *search_attr_name* and *search_attr_value* is *None*, - the behaviour is as if both were *None*. 
- """ - return_str = "" - if search_attr_name is not None and search_attr_value is not None: - adj_start_i = get_adjusted_start_i( - doc, start_i, cols, search_attr_name, search_attr_value + # render relation between this token and its head + first_ind_in_rel = min( + working_token_ind, + cast(int, heads[working_token_ind]), ) - else: - adj_start_i = start_i - if adj_start_i >= len(doc): - return return_str - end_i = len(doc) - 1 - if length is not None: - end_i = min(end_i, adj_start_i + length) - elif start_i > 0 or ( - search_attr_name is not None and search_attr_value is not None - ): - end_i = adj_start_i - adj_start_i = doc[adj_start_i].sent.start - end_i = doc[end_i].sent.end - for sent in doc[adj_start_i:end_i].sents: - if "tree_right" in (c.attribute for c in cols): - tree_right = self.render_dep_tree(sent, True) - if "tree_left" in (c.attribute for c in cols): - tree_left = self.render_dep_tree(sent, False) - widths = [] - for col in cols: - # get the values without any color codes - if col.attribute == "tree_left": - width = len(tree_left[0]) # type: ignore - elif col.attribute == "tree_right": - width = len(tree_right[0]) # type: ignore - else: - if len(sent) > 0: - width = max( - len(col.render(token, ignore_colors=True)) for token in sent - ) - else: - width = 0 - if col.max_width is not None: - width = min(width, col.max_width) - width = max(width, len(col.name)) - widths.append(width) - data: List[List[str]] = [] - for token_index, token in enumerate(sent): - inner_data: List[str] = [] - for col_index, col in enumerate(cols): - if col.attribute == "tree_right": - inner_data.append(tree_right[token_index]) - elif col.attribute == "tree_left": - inner_data.append(tree_left[token_index]) - else: - inner_data.append( - col.render(token, right_pad_to_len=widths[col_index]) - ) - data.append(inner_data) - header: Optional[List[str]] - if len([1 for c in cols if len(c.name) > 0]) > 0: - header = [c.name for c in cols] + second_ind_in_rel = max( + 
working_token_ind, + cast(int, heads[working_token_ind]), + ) + # If this token has children, they will already have been rendered. + # The line needs to be one character longer than the longest of the + # children's lines. + if len(children_lists[working_token_ind]) > 0: + horiz_line_lens[working_token_ind] = ( + max([horiz_line_lens[i] for i in children_lists[working_token_ind]]) + + 1 + ) else: - header = None - aligns = [c.aligns for c in cols] - fg_colors = [c.fg_color for c in cols] - bg_colors = [c.bg_color for c in cols] - return_str += ( - wasabi.table( - data, - header=header, - divider=True, - aligns=aligns, - widths=widths, - fg_colors=fg_colors, - bg_colors=bg_colors, - spacing=spacing, + horiz_line_lens[working_token_ind] = 1 + for inbetween_ind in ( + i + for i in range(first_ind_in_rel + 1, second_ind_in_rel) + if horiz_line_lens[i] != 0 + ): + alt_ind: int + if ( + inbetween_ind in children_lists[cast(int, heads[working_token_ind])] + and inbetween_ind not in children_lists[working_token_ind] + ): + alt_ind = horiz_line_lens[inbetween_ind] + else: + alt_ind = horiz_line_lens[inbetween_ind] + 1 + if alt_ind > horiz_line_lens[working_token_ind]: + horiz_line_lens[working_token_ind] = alt_ind + max_horiz_line_len = max(horiz_line_lens) + char_matrix = [ + [SPACE] * max_horiz_line_len * 2 for _ in range(sent.start, sent.end) + ] + for working_token_ind in range(sent.end - sent.start): + head_token_ind = heads[working_token_ind] + if head_token_ind is None: + continue + first_ind_in_rel = min(working_token_ind, head_token_ind) + second_ind_in_rel = max(working_token_ind, head_token_ind) + char_horiz_line_len = 2 * horiz_line_lens[working_token_ind] + + # Draw the corners of the relation + char_matrix[first_ind_in_rel][char_horiz_line_len - 1] |= ( + HALF_HORIZONTAL_LINE + LOWER_HALF_VERTICAL_LINE + ) + char_matrix[second_ind_in_rel][char_horiz_line_len - 1] |= ( + HALF_HORIZONTAL_LINE + UPPER_HALF_VERTICAL_LINE + ) + + # Draw the horizontal line for the 
def render_table(
    doc,
    cols: List[AttributeFormat],
    spacing: int = 3,
    search_attr_name: Optional[str] = None,
    search_attr_value: Optional[str] = None,
    start_i: int = 0,
    length: Optional[int] = None,
) -> str:
    """Renders a document as a table, allowing the caller to specify various
    display options.

    doc: the document.
    cols: the attribute formats of the columns to display.
        tree_right and tree_left are magic values for the
        attributes that render dependency trees where the
        roots are on the left or right respectively.
    spacing: the number of spaces between each column in the table.
    search_attr_name: the name of an attribute to search for in order to
        determine where to start rendering, e.g. "lemma_",
        or *None* if no search is to be carried out. If either
        of *search_attr_name* and *search_attr_value* is *None*,
        the behaviour is as if both were *None*.
    search_attr_value: the value of an attribute to search for in order to
        determine where to start rendering, e.g. "be",
        or *None* if no search is to be carried out. If either
        of *search_attr_name* and *search_attr_value* is *None*,
        the behaviour is as if both were *None*.
    start_i: the token index at which to start searching, or at
        whose sentence to start rendering. Default: 0.
    length: the number of tokens after *start_i* at whose sentence
        to stop rendering. If *None*, the rest of the
        document is rendered.
    RETURNS: one table per rendered sentence, each followed by a newline,
        concatenated into a single string. An empty string is returned
        if the (adjusted) start index lies beyond the end of the document.
    """
    if (
        search_attr_name is not None
        and search_attr_name not in ("tree_right", "tree_left")
        and search_attr_value is not None
    ):
        adj_start_i = _get_adjusted_start_i(
            doc, start_i, cols, search_attr_name, search_attr_value
        )
    else:
        adj_start_i = start_i
    if adj_start_i >= len(doc):
        return ""
    end_i = len(doc) - 1
    if length is not None:
        end_i = min(end_i, adj_start_i + length)
    # NOTE(review): unlike the search guard above, this check does not exclude
    # the tree_* pseudo-attributes -- confirm whether that is intended.
    elif start_i > 0 or (
        search_attr_name is not None and search_attr_value is not None
    ):
        end_i = adj_start_i
    # Widen the range so that it always covers whole sentences.
    adj_start_i = doc[adj_start_i].sent.start
    end_i = doc[end_i].sent.end

    # Everything below depends only on *cols*, so compute it once rather
    # than once per sentence.
    attributes = [c.attribute for c in cols]
    has_tree_right = "tree_right" in attributes
    has_tree_left = "tree_left" in attributes
    header: Optional[List[str]] = None
    if any(len(c.name) > 0 for c in cols):
        header = [c.name for c in cols]
    aligns = [c.aligns for c in cols]
    fg_colors = [c.fg_color for c in cols]
    bg_colors = [c.bg_color for c in cols]

    tables: List[str] = []
    for sent in doc[adj_start_i:end_i].sents:
        tree_right: List[str] = []
        tree_left: List[str] = []
        if has_tree_right:
            tree_right = render_dep_tree(sent, True)
        if has_tree_left:
            tree_left = render_dep_tree(sent, False)

        widths: List[int] = []
        for col in cols:
            if col.attribute == "tree_left":
                width = len(tree_left[0]) if tree_left else 0
            elif col.attribute == "tree_right":
                width = len(tree_right[0]) if tree_right else 0
            else:
                # measure the values without any color codes
                if len(sent) > 0:
                    width = max(
                        len(col.render(token, ignore_colors=True)) for token in sent
                    )
                else:
                    width = 0
                if col.max_width is not None:
                    width = min(width, col.max_width)
            # a column is never narrower than its header
            width = max(width, len(col.name))
            widths.append(width)

        data: List[List[str]] = []
        for token_index, token in enumerate(sent):
            row: List[str] = []
            for col, width in zip(cols, widths):
                if col.attribute == "tree_right":
                    row.append(tree_right[token_index])
                elif col.attribute == "tree_left":
                    row.append(tree_left[token_index])
                else:
                    row.append(col.render(token, right_pad_to_len=width))
            data.append(row)

        tables.append(
            wasabi.table(
                data,
                header=header,
                divider=True,
                aligns=aligns,
                widths=widths,
                fg_colors=fg_colors,
                bg_colors=bg_colors,
                spacing=spacing,
            )
            + "\n"
        )
    return "".join(tables)
def render_document(
    doc,
    search_attr_name: Optional[str] = None,
    search_attr_value: Optional[str] = None,
    *,
    start_i: int = 0,
    length: Optional[int] = None,
) -> str:
    """Renders a document as a table using a fixed, standard set of columns.

    doc: the document.
    search_attr_name: the name of an attribute to search for in order to
        determine where to start rendering, e.g. "lemma_",
        or *None* if no search is to be carried out. If either
        of *search_attr_name* and *search_attr_value* is *None*,
        the behaviour is as if both were *None*.
    search_attr_value: the value of an attribute to search for in order to
        determine where to start rendering, e.g. "be",
        or *None* if no search is to be carried out. If either
        of *search_attr_name* and *search_attr_value* is *None*,
        the behaviour is as if both were *None*.
    start_i: the token index at which to start searching, or at
        whose sentence to start rendering. Default: 0.
    length: the number of tokens after *start_i* at whose sentence
        to stop rendering. If *None*, the rest of the
        document is rendered.
    RETURNS: the string produced by *render_table* for the standard columns.
    """
    standard_cols = [
        AttributeFormat("tree_left", name="tree", aligns="r", fg_color=4),
        AttributeFormat("dep_", name="dep_"),
        AttributeFormat("ent_type_", name="ent_type_"),
        AttributeFormat("i", name="index", aligns="r"),
        AttributeFormat("text", name="text", max_width=20),
        AttributeFormat("lemma_", name="lemma_", max_width=20),
        AttributeFormat("pos_", name="pos_"),
        AttributeFormat("tag_", name="tag_"),
        AttributeFormat("morph", name="morph_", max_width=60),
    ]
    searching = search_attr_name is not None and search_attr_value is not None
    if searching:
        # Give the searched-for value a distinct foreground color in every
        # column that is identified by the search name.
        searched_cols = [
            fmt
            for fmt in standard_cols
            if search_attr_name in (fmt.attribute, fmt.name)
        ]
        for fmt in searched_cols:
            fmt.value_dep_fg_colors[search_attr_value] = 1
    return render_table(
        doc,
        standard_cols,
        spacing=3,
        search_attr_name=search_attr_name,
        search_attr_value=search_attr_value,
        start_i=start_i,
        length=length,
    )