First working version

richardpaulhudson 2023-01-26 19:21:27 +01:00
parent 0ea623990e
commit 9243341f74
3 changed files with 432 additions and 340 deletions

View File

@@ -1,6 +1,6 @@
import pytest
from wasabi.util import supports_ansi
from spacy.visualization import AttributeFormat, Visualizer
from spacy.visualization import AttributeFormat, render_dep_tree, render_table
from spacy.tokens import Span, Doc, Token
@@ -45,7 +45,7 @@ def test_viz_dep_tree_basic(en_vocab):
heads=[2, 2, 3, None, 6, 6, 3, 3, 3],
deps=["dep"] * 9,
)
dep_tree = Visualizer.render_dep_tree(doc[0 : len(doc)], True)
dep_tree = render_dep_tree(doc[0 : len(doc)], True)
assert dep_tree == [
"<╗ ",
"<╣ ",
@@ -57,7 +57,7 @@ def test_viz_dep_tree_basic(en_vocab):
"<══╣",
"<══╝",
]
dep_tree = Visualizer.render_dep_tree(doc[0 : len(doc)], False)
dep_tree = render_dep_tree(doc[0 : len(doc)], False)
assert dep_tree == [
" ╔>",
" ╠>",
@@ -92,7 +92,7 @@ def test_viz_dep_tree_non_initial_sent(en_vocab):
heads=[0, None, 0, 5, 5, 6, None, 9, 9, 6, 6, 6],
deps=["dep"] * 12,
)
dep_tree = Visualizer.render_dep_tree(doc[3 : len(doc)], True)
dep_tree = render_dep_tree(doc[3 : len(doc)], True)
assert dep_tree == [
"<╗ ",
"<╣ ",
@@ -104,7 +104,7 @@ def test_viz_dep_tree_non_initial_sent(en_vocab):
"<══╣",
"<══╝",
]
dep_tree = Visualizer.render_dep_tree(doc[3 : len(doc)], False)
dep_tree = render_dep_tree(doc[3 : len(doc)], False)
assert dep_tree == [
" ╔>",
" ╠>",
@@ -120,7 +120,7 @@ def test_viz_dep_tree_non_initial_sent(en_vocab):
def test_viz_dep_tree_non_projective(horse_doc):
"""Test dependency tree display with a non-projective dependency."""
dep_tree = Visualizer.render_dep_tree(horse_doc[0 : len(horse_doc)], True)
dep_tree = render_dep_tree(horse_doc[0 : len(horse_doc)], True)
assert dep_tree == [
"<╗ ",
"═╩═══╗",
@@ -132,7 +132,7 @@ def test_viz_dep_tree_non_projective(horse_doc):
"═╝<╝ ║",
"<════╝",
]
dep_tree = Visualizer.render_dep_tree(horse_doc[0 : len(horse_doc)], False)
dep_tree = render_dep_tree(horse_doc[0 : len(horse_doc)], False)
assert dep_tree == [
" ╔>",
"╔═══╩═",
@@ -163,7 +163,7 @@ def test_viz_dep_tree_highly_nonprojective(pl_vocab):
heads=[5, 5, 0, 5, 5, None, 4, 5],
deps=["dep"] * 8,
)
dep_tree = Visualizer.render_dep_tree(doc[0 : len(doc)], True)
dep_tree = render_dep_tree(doc[0 : len(doc)], True)
assert dep_tree == [
"═╗<╗",
" ║<╣",
@@ -174,7 +174,7 @@ def test_viz_dep_tree_highly_nonprojective(pl_vocab):
"<╝ ║",
"<══╝",
]
dep_tree = Visualizer.render_dep_tree(doc[0 : len(doc)], False)
dep_tree = render_dep_tree(doc[0 : len(doc)], False)
assert dep_tree == [
"╔>╔═",
"╠>║ ",
@@ -190,7 +190,7 @@ def test_viz_dep_tree_highly_nonprojective(pl_vocab):
def test_viz_dep_tree_input_not_span(horse_doc):
"""Test dependency tree display behaviour when the input is not a Span."""
with pytest.raises(ValueError):
Visualizer.render_dep_tree(horse_doc[1:3], True)
render_dep_tree(horse_doc[1:3], True)
def test_viz_render_native_attributes(horse_doc):
@@ -199,7 +199,10 @@ def test_viz_render_native_attributes(horse_doc):
assert AttributeFormat("dep_").render(horse_doc[2]) == "dep"
with pytest.raises(AttributeError):
AttributeFormat("depp").render(horse_doc[2])
with pytest.raises(AttributeError):
AttributeFormat("tree_left").render(horse_doc[2])
with pytest.raises(AttributeError):
AttributeFormat("tree_right").render(horse_doc[2])
def test_viz_render_colors(horse_doc):
assert (
@@ -265,7 +268,7 @@ def test_viz_minimal_render_table_one_sentence(
AttributeFormat("ent_type_"),
]
assert (
Visualizer().render(fully_featured_doc_one_sentence, formats, spacing=3).strip()
render_table(fully_featured_doc_one_sentence, formats, spacing=3).strip()
== """
> poss Sarah sarah PROPN NNP NounType=prop|Number=sing PERSON
> case 's 's PART POS Poss=yes
@@ -295,7 +298,7 @@ def test_viz_minimal_render_table_empty_text(
AttributeFormat("morph"),
AttributeFormat("ent_type_"),
]
assert Visualizer().render(Doc(en_vocab), formats, spacing=3).strip() == ""
assert render_table(Doc(en_vocab), formats, spacing=3).strip() == ""
# headers
formats = [
@@ -308,7 +311,7 @@ def test_viz_minimal_render_table_empty_text(
AttributeFormat("morph"),
AttributeFormat("ent_type_", name="ent"),
]
assert Visualizer().render(Doc(en_vocab), formats, spacing=3).strip() == ""
assert render_table(Doc(en_vocab), formats, spacing=3).strip() == ""
def test_viz_minimal_render_table_spacing(
@@ -325,7 +328,7 @@ def test_viz_minimal_render_table_spacing(
AttributeFormat("ent_type_"),
]
assert (
Visualizer().render(fully_featured_doc_one_sentence, formats, spacing=1).strip()
render_table(fully_featured_doc_one_sentence, formats, spacing=1).strip()
== """
> poss Sarah sarah PROPN NNP NounType=prop|Number=sing PERSON
> case 's 's PART POS Poss=yes
@@ -356,8 +359,7 @@ def test_viz_minimal_render_table_two_sentences(
]
assert (
Visualizer()
.render(fully_featured_doc_two_sentences, formats, spacing=3)
render_table(fully_featured_doc_two_sentences, formats, spacing=3)
.strip()
== """
> poss Sarah sarah PROPN NNP NounType=prop|Number=sing PERSON
@@ -401,7 +403,7 @@ def test_viz_rich_render_table_one_sentence(
),
]
assert (
Visualizer().render(fully_featured_doc_one_sentence, formats, spacing=3)
render_table(fully_featured_doc_one_sentence, formats, spacing=3)
== "\n\x1b[38;5;2m tree\x1b[0m \x1b[38;5;2mdep \x1b[0m index text lemma \x1b[38;5;100mpos \x1b[0m \x1b[38;5;100mtag\x1b[0m \x1b[38;5;100mmorph \x1b[0m \x1b[38;5;196ment \x1b[0m\n\x1b[38;5;2m------\x1b[0m \x1b[38;5;2m--------\x1b[0m ----- ------- ------- \x1b[38;5;100m-----\x1b[0m \x1b[38;5;100m---\x1b[0m \x1b[38;5;100m---------------\x1b[0m \x1b[38;5;196m------\x1b[0m\n\x1b[38;5;2m ╔>╔═\x1b[0m \x1b[38;5;2mposs \x1b[0m 0 Sarah sarah \x1b[38;5;100mPROPN\x1b[0m \x1b[38;5;100mNNP\x1b[0m \x1b[38;5;100mNounType=prop|N\x1b[0m \x1b[38;5;196m\x1b[38;5;50;48;5;12mPERSON\x1b[0m\x1b[0m\n\x1b[38;5;2m ║ ╚>\x1b[0m \x1b[38;5;2mcase \x1b[0m 1 's 's \x1b[38;5;100mPART \x1b[0m \x1b[38;5;100mPOS\x1b[0m \x1b[38;5;100mPoss=yes \x1b[0m \x1b[38;5;196m \x1b[0m\n\x1b[38;5;2m╔>╚═══\x1b[0m \x1b[38;5;2mnsubj \x1b[0m 2 sister sister \x1b[38;5;100mNOUN \x1b[0m \x1b[38;5;100mNN \x1b[0m \x1b[38;5;100mNumber=sing \x1b[0m \x1b[38;5;196m \x1b[0m\n\x1b[38;5;2m╠═════\x1b[0m \x1b[38;5;2mROOT \x1b[0m 3 flew fly \x1b[38;5;100mVERB \x1b[0m \x1b[38;5;100mVBD\x1b[0m \x1b[38;5;100mTense=past|Verb\x1b[0m \x1b[38;5;196m \x1b[0m\n\x1b[38;5;2m╠>╔═══\x1b[0m \x1b[38;5;2mprep \x1b[0m 4 to to \x1b[38;5;100mADP \x1b[0m \x1b[38;5;100mIN \x1b[0m \x1b[38;5;100m \x1b[0m \x1b[38;5;196m \x1b[0m\n\x1b[38;5;2m║ ║ ╔>\x1b[0m \x1b[38;5;2mcompound\x1b[0m 5 Silicon silicon \x1b[38;5;100mPROPN\x1b[0m \x1b[38;5;100mNNP\x1b[0m \x1b[38;5;100mNounType=prop|N\x1b[0m \x1b[38;5;196mGPE \x1b[0m\n\x1b[38;5;2m║ ╚>╚═\x1b[0m \x1b[38;5;2mpobj \x1b[0m 6 Valley valley \x1b[38;5;100mPROPN\x1b[0m \x1b[38;5;100mNNP\x1b[0m \x1b[38;5;100mNounType=prop|N\x1b[0m \x1b[38;5;196mGPE \x1b[0m\n\x1b[38;5;2m╠══>╔═\x1b[0m \x1b[38;5;2mprep \x1b[0m 7 via via \x1b[38;5;100mADP \x1b[0m \x1b[38;5;100mIN \x1b[0m \x1b[38;5;100m \x1b[0m \x1b[38;5;196m \x1b[0m\n\x1b[38;5;2m║ ╚>\x1b[0m \x1b[38;5;2mpobj \x1b[0m 8 London london \x1b[38;5;100mPROPN\x1b[0m \x1b[38;5;100mNNP\x1b[0m \x1b[38;5;100mNounType=prop|N\x1b[0m \x1b[38;5;196mGPE \x1b[0m\n\x1b[38;5;2m╚════>\x1b[0m \x1b[38;5;2mpunct \x1b[0m 9 . . \x1b[38;5;100mPUNCT\x1b[0m \x1b[38;5;100m. \x1b[0m \x1b[38;5;100mPunctType=peri \x1b[0m \x1b[38;5;196m \x1b[0m\n\n"
if SUPPORTS_ANSI
else "\n\x1b[38;5;2m tree\x1b[0m \x1b[38;5;2mdep \x1b[0m index text lemma pos tag morph ent \n\x1b[38;5;2m------\x1b[0m \x1b[38;5;2m--------\x1b[0m ----- ------- ------- ----- --- --------------- ------\n\x1b[38;5;2m ╔>╔═\x1b[0m \x1b[38;5;2mposs \x1b[0m 0 Sarah sarah PROPN NNP NounType=prop|N PERSON\n\x1b[38;5;2m ║ ╚>\x1b[0m \x1b[38;5;2mcase \x1b[0m 1 's 's PART POS Poss=yes \n\x1b[38;5;2m╔>╚═══\x1b[0m \x1b[38;5;2mnsubj \x1b[0m 2 sister sister NOUN NN Number=sing \n\x1b[38;5;2m╠═════\x1b[0m \x1b[38;5;2mROOT \x1b[0m 3 flew fly VERB VBD Tense=past|Verb \n\x1b[38;5;2m╠>╔═══\x1b[0m \x1b[38;5;2mprep \x1b[0m 4 to to ADP IN \n\x1b[38;5;2m║ ║ ╔>\x1b[0m \x1b[38;5;2mcompound\x1b[0m 5 Silicon silicon PROPN NNP NounType=prop|N GPE \n\x1b[38;5;2m║ ╚>╚═\x1b[0m \x1b[38;5;2mpobj \x1b[0m 6 Valley valley PROPN NNP NounType=prop|N GPE \n\x1b[38;5;2m╠══>╔═\x1b[0m \x1b[38;5;2mprep \x1b[0m 7 via via ADP IN \n\x1b[38;5;2m║ ╚>\x1b[0m \x1b[38;5;2mpobj \x1b[0m 8 London london PROPN NNP NounType=prop|N GPE \n\x1b[38;5;2m╚════>\x1b[0m \x1b[38;5;2mpunct \x1b[0m 9 . . PUNCT . PunctType=peri \n\n"
@@ -429,7 +431,7 @@ def test_viz_rich_render_table_one_sentence(
),
]
assert (
Visualizer().render(fully_featured_doc_one_sentence, formats, spacing=3)
render_table(fully_featured_doc_one_sentence, formats, spacing=3)
== "\n\x1b[38;5;2m tree\x1b[0m \x1b[38;5;2mdep \x1b[0m index \x1b[38;5;196mtext \x1b[0m lemma \x1b[38;5;100mpos \x1b[0m \x1b[38;5;100mtag\x1b[0m \x1b[38;5;100mmorph \x1b[0m ent \n\x1b[38;5;2m------\x1b[0m \x1b[38;5;2m--------\x1b[0m ----- \x1b[38;5;196m-------\x1b[0m ------- \x1b[38;5;100m-----\x1b[0m \x1b[38;5;100m---\x1b[0m \x1b[38;5;100m---------------\x1b[0m ------\n\x1b[38;5;2m ╔>╔═\x1b[0m \x1b[38;5;2mposs \x1b[0m 0 \x1b[38;5;196mSarah \x1b[0m sarah \x1b[38;5;100mPROPN\x1b[0m \x1b[38;5;100mNNP\x1b[0m \x1b[38;5;100mNounType=prop|N\x1b[0m PERSON\n\x1b[38;5;2m ║ ╚>\x1b[0m \x1b[38;5;2mcase \x1b[0m 1 \x1b[38;5;196m\x1b[38;5;50;48;5;12m's\x1b[0m \x1b[0m 's \x1b[38;5;100mPART \x1b[0m \x1b[38;5;100mPOS\x1b[0m \x1b[38;5;100mPoss=yes \x1b[0m \n\x1b[38;5;2m╔>╚═══\x1b[0m \x1b[38;5;2mnsubj \x1b[0m 2 \x1b[38;5;196msister \x1b[0m sister \x1b[38;5;100mNOUN \x1b[0m \x1b[38;5;100mNN \x1b[0m \x1b[38;5;100mNumber=sing \x1b[0m \n\x1b[38;5;2m╠═════\x1b[0m \x1b[38;5;2mROOT \x1b[0m 3 \x1b[38;5;196mflew \x1b[0m fly \x1b[38;5;100mVERB \x1b[0m \x1b[38;5;100mVBD\x1b[0m \x1b[38;5;100mTense=past|Verb\x1b[0m \n\x1b[38;5;2m╠>╔═══\x1b[0m \x1b[38;5;2mprep \x1b[0m 4 \x1b[38;5;196mto \x1b[0m to \x1b[38;5;100mADP \x1b[0m \x1b[38;5;100mIN \x1b[0m \x1b[38;5;100m \x1b[0m \n\x1b[38;5;2m║ ║ ╔>\x1b[0m \x1b[38;5;2mcompound\x1b[0m 5 \x1b[38;5;196mSilicon\x1b[0m silicon \x1b[38;5;100mPROPN\x1b[0m \x1b[38;5;100mNNP\x1b[0m \x1b[38;5;100mNounType=prop|N\x1b[0m GPE \n\x1b[38;5;2m║ ╚>╚═\x1b[0m \x1b[38;5;2mpobj \x1b[0m 6 \x1b[38;5;196mValley \x1b[0m valley \x1b[38;5;100mPROPN\x1b[0m \x1b[38;5;100mNNP\x1b[0m \x1b[38;5;100mNounType=prop|N\x1b[0m GPE \n\x1b[38;5;2m╠══>╔═\x1b[0m \x1b[38;5;2mprep \x1b[0m 7 \x1b[38;5;196mvia \x1b[0m via \x1b[38;5;100mADP \x1b[0m \x1b[38;5;100mIN \x1b[0m \x1b[38;5;100m \x1b[0m \n\x1b[38;5;2m║ ╚>\x1b[0m \x1b[38;5;2mpobj \x1b[0m 8 \x1b[38;5;196mLondon \x1b[0m london \x1b[38;5;100mPROPN\x1b[0m \x1b[38;5;100mNNP\x1b[0m \x1b[38;5;100mNounType=prop|N\x1b[0m GPE \n\x1b[38;5;2m╚════>\x1b[0m \x1b[38;5;2mpunct \x1b[0m 9 \x1b[38;5;196m. \x1b[0m . \x1b[38;5;100mPUNCT\x1b[0m \x1b[38;5;100m. \x1b[0m \x1b[38;5;100mPunctType=peri \x1b[0m \n\n"
if SUPPORTS_ANSI
else "\n\x1b[38;5;2m tree\x1b[0m \x1b[38;5;2mdep \x1b[0m index text lemma pos tag \x1b[38;5;100mmorph \x1b[0m ent \n\x1b[38;5;2m------\x1b[0m \x1b[38;5;2m--------\x1b[0m ----- ------- ------- ----- --- \x1b[38;5;100m-------------------------\x1b[0m ------\n\x1b[38;5;2m ╔>╔═\x1b[0m \x1b[38;5;2mposs \x1b[0m 0 Sarah sarah PROPN NNP \x1b[38;5;100mNounType=prop|Number=sing\x1b[0m PERSON\n\x1b[38;5;2m ║ ╚>\x1b[0m \x1b[38;5;2mcase \x1b[0m 1 's 's PART POS \x1b[38;5;100mPoss=yes \x1b[0m \n\x1b[38;5;2m╔>╚═══\x1b[0m \x1b[38;5;2mnsubj \x1b[0m 2 sister sister NOUN NN \x1b[38;5;100mNumber=sing \x1b[0m \n\x1b[38;5;2m╠═════\x1b[0m \x1b[38;5;2mROOT \x1b[0m 3 flew fly VERB VBD \x1b[38;5;100mTense=past|VerbForm=fin \x1b[0m \n\x1b[38;5;2m╠>╔═══\x1b[0m \x1b[38;5;2mprep \x1b[0m 4 to to ADP IN \x1b[38;5;100m \x1b[0m \n\x1b[38;5;2m║ ║ ╔>\x1b[0m \x1b[38;5;2mcompound\x1b[0m 5 Silicon silicon PROPN NNP \x1b[38;5;100mNounType=prop|Number=sing\x1b[0m GPE \n\x1b[38;5;2m║ ╚>╚═\x1b[0m \x1b[38;5;2mpobj \x1b[0m 6 Valley valley PROPN NNP \x1b[38;5;100mNounType=prop|Number=sing\x1b[0m GPE \n\x1b[38;5;2m╠══>╔═\x1b[0m \x1b[38;5;2mprep \x1b[0m 7 via via ADP IN \x1b[38;5;100m \x1b[0m \n\x1b[38;5;2m║ ╚>\x1b[0m \x1b[38;5;2mpobj \x1b[0m 8 London london PROPN NNP \x1b[38;5;100mNounType=prop|Number=sing\x1b[0m GPE \n\x1b[38;5;2m╚════>\x1b[0m \x1b[38;5;2mpunct \x1b[0m 9 . . PUNCT . \x1b[38;5;100mPunctType=peri \x1b[0m \n\n"
@@ -456,9 +458,9 @@ def test_viz_rich_render_table_two_sentences(
value_dep_bg_colors={"PERSON": 12},
),
]
print(Visualizer().render(fully_featured_doc_two_sentences, formats, spacing=3))
print(render_table(fully_featured_doc_two_sentences, formats, spacing=3))
print(
repr(Visualizer().render(fully_featured_doc_two_sentences, formats, spacing=3))
repr(render_table(fully_featured_doc_two_sentences, formats, spacing=3))
)
target = (
"\n\x1b[38;5;2m tree\x1b[0m \x1b[38;5;2mdep \x1b[0m index text lemma \x1b[38;5;100mpos \x1b[0m \x1b[38;5;100mtag\x1b[0m \x1b[38;5;100mmorph \x1b[0m \x1b[38;5;196ment \x1b[0m\n\x1b[38;5;2m------\x1b[0m \x1b[38;5;2m--------\x1b[0m ----- ------- ------- \x1b[38;5;100m-----\x1b[0m \x1b[38;5;100m---\x1b[0m \x1b[38;5;100m---------------\x1b[0m \x1b[38;5;196m------\x1b[0m\n\x1b[38;5;2m ╔>╔═\x1b[0m \x1b[38;5;2mposs \x1b[0m 0 Sarah sarah \x1b[38;5;100mPROPN\x1b[0m \x1b[38;5;100mNNP\x1b[0m \x1b[38;5;100mNounType=prop|N\x1b[0m \x1b[38;5;196m\x1b[38;5;50;48;5;12mPERSON\x1b[0m\x1b[0m\n\x1b[38;5;2m ║ ╚>\x1b[0m \x1b[38;5;2mcase \x1b[0m 1 's 's \x1b[38;5;100mPART \x1b[0m \x1b[38;5;100mPOS\x1b[0m \x1b[38;5;100mPoss=yes \x1b[0m \x1b[38;5;196m \x1b[0m\n\x1b[38;5;2m╔>╚═══\x1b[0m \x1b[38;5;2mnsubj \x1b[0m 2 sister sister \x1b[38;5;100mNOUN \x1b[0m \x1b[38;5;100mNN \x1b[0m \x1b[38;5;100mNumber=sing \x1b[0m \x1b[38;5;196m \x1b[0m\n\x1b[38;5;2m╠═════\x1b[0m \x1b[38;5;2mROOT \x1b[0m 3 flew fly \x1b[38;5;100mVERB \x1b[0m \x1b[38;5;100mVBD\x1b[0m \x1b[38;5;100mTense=past|Verb\x1b[0m \x1b[38;5;196m \x1b[0m\n\x1b[38;5;2m╠>╔═══\x1b[0m \x1b[38;5;2mprep \x1b[0m 4 to to \x1b[38;5;100mADP \x1b[0m \x1b[38;5;100mIN \x1b[0m \x1b[38;5;100m \x1b[0m \x1b[38;5;196m \x1b[0m\n\x1b[38;5;2m║ ║ ╔>\x1b[0m \x1b[38;5;2mcompound\x1b[0m 5 Silicon silicon \x1b[38;5;100mPROPN\x1b[0m \x1b[38;5;100mNNP\x1b[0m \x1b[38;5;100mNounType=prop|N\x1b[0m \x1b[38;5;196mGPE \x1b[0m\n\x1b[38;5;2m║ ╚>╚═\x1b[0m \x1b[38;5;2mpobj \x1b[0m 6 Valley valley \x1b[38;5;100mPROPN\x1b[0m \x1b[38;5;100mNNP\x1b[0m \x1b[38;5;100mNounType=prop|N\x1b[0m \x1b[38;5;196mGPE \x1b[0m\n\x1b[38;5;2m╠══>╔═\x1b[0m \x1b[38;5;2mprep \x1b[0m 7 via via \x1b[38;5;100mADP \x1b[0m \x1b[38;5;100mIN \x1b[0m \x1b[38;5;100m \x1b[0m \x1b[38;5;196m \x1b[0m\n\x1b[38;5;2m║ ╚>\x1b[0m \x1b[38;5;2mpobj \x1b[0m 8 London london \x1b[38;5;100mPROPN\x1b[0m \x1b[38;5;100mNNP\x1b[0m \x1b[38;5;100mNounType=prop|N\x1b[0m \x1b[38;5;196mGPE \x1b[0m\n\x1b[38;5;2m╚════>\x1b[0m \x1b[38;5;2mpunct \x1b[0m 9 . . \x1b[38;5;100mPUNCT\x1b[0m \x1b[38;5;100m. \x1b[0m \x1b[38;5;100mPunctType=peri \x1b[0m \x1b[38;5;196m \x1b[0m\n\n\n\x1b[38;5;2mtree\x1b[0m \x1b[38;5;2mdep \x1b[0m index text lemma \x1b[38;5;100mpos \x1b[0m \x1b[38;5;100mtag\x1b[0m \x1b[38;5;100mmorph \x1b[0m \x1b[38;5;196ment\x1b[0m\n\x1b[38;5;2m----\x1b[0m \x1b[38;5;2m-----\x1b[0m ----- ----- ----- \x1b[38;5;100m-----\x1b[0m \x1b[38;5;100m---\x1b[0m \x1b[38;5;100m---------------\x1b[0m \x1b[38;5;196m---\x1b[0m\n\x1b[38;5;2m ╔>\x1b[0m \x1b[38;5;2mnsubj\x1b[0m 10 She she \x1b[38;5;100mPRON \x1b[0m \x1b[38;5;100mPRP\x1b[0m \x1b[38;5;100mCase=Nom|Gender\x1b[0m \x1b[38;5;196m \x1b[0m\n\x1b[38;5;2m ╠═\x1b[0m \x1b[38;5;2mROOT \x1b[0m 11 loved love \x1b[38;5;100mVERB \x1b[0m \x1b[38;5;100mVBD\x1b[0m \x1b[38;5;100mTense=Past|Verb\x1b[0m \x1b[38;5;196m \x1b[0m\n\x1b[38;5;2m ╠>\x1b[0m \x1b[38;5;2mdobj \x1b[0m 12 it it \x1b[38;5;100mPRON \x1b[0m \x1b[38;5;100mPRP\x1b[0m \x1b[38;5;100mCase=Acc|Gender\x1b[0m \x1b[38;5;196m \x1b[0m\n\x1b[38;5;2m ╚>\x1b[0m \x1b[38;5;2mpunct\x1b[0m 13 . . \x1b[38;5;100mPUNCT\x1b[0m \x1b[38;5;100m. \x1b[0m \x1b[38;5;100mPunctType=peri \x1b[0m \x1b[38;5;196m \x1b[0m\n\n"
@@ -466,17 +468,17 @@ def test_viz_rich_render_table_two_sentences(
else "\n tree dep index text lemma pos tag morph ent \n------ -------- ----- ------- ------- ----- --- --------------- ------\n ╔>╔═ poss 0 Sarah sarah PROPN NNP NounType=prop|N PERSON\n ║ ╚> case 1 's 's PART POS Poss=yes \n╔>╚═══ nsubj 2 sister sister NOUN NN Number=sing \n╠═════ ROOT 3 flew fly VERB VBD Tense=past|Verb \n╠>╔═══ prep 4 to to ADP IN \n║ ║ ╔> compound 5 Silicon silicon PROPN NNP NounType=prop|N GPE \n║ ╚>╚═ pobj 6 Valley valley PROPN NNP NounType=prop|N GPE \n╠══>╔═ prep 7 via via ADP IN \n║ ╚> pobj 8 London london PROPN NNP NounType=prop|N GPE \n╚════> punct 9 . . PUNCT . PunctType=peri \n\n\ntree dep index text lemma pos tag morph ent\n---- ----- ----- ----- ----- ----- --- --------------- ---\n ╔> nsubj 10 She she PRON PRP Case=Nom|Gender \n ╠═ ROOT 11 loved love VERB VBD Tense=Past|Verb \n ╠> dobj 12 it it PRON PRP Case=Acc|Gender \n ╚> punct 13 . . PUNCT . PunctType=peri \n\n"
)
assert (
Visualizer().render(fully_featured_doc_two_sentences, formats, spacing=3)
render_table(fully_featured_doc_two_sentences, formats, spacing=3)
== target
)
assert (
Visualizer().render(
render_table(
fully_featured_doc_two_sentences, formats, spacing=3, start_i=3, length=300
)
== target
)
assert (
Visualizer().render(
render_table(
fully_featured_doc_two_sentences, formats, spacing=3, start_i=3, length=9
)
== target
@@ -504,13 +506,13 @@ def test_viz_rich_render_table_start(
),
]
print(
Visualizer().render(
render_table(
fully_featured_doc_two_sentences, formats, spacing=3, start_i=11
)
)
print(
repr(
Visualizer().render(
render_table(
fully_featured_doc_two_sentences, formats, spacing=3, start_i=11
)
)
@@ -521,13 +523,13 @@ def test_viz_rich_render_table_start(
else "\ntree dep index text lemma pos tag morph ent\n---- ----- ----- ----- ----- ----- --- --------------- ---\n ╔> nsubj 10 She she PRON PRP Case=Nom|Gender \n ╠═ ROOT 11 loved love VERB VBD Tense=Past|Verb \n ╠> dobj 12 it it PRON PRP Case=Acc|Gender \n ╚> punct 13 . . PUNCT . PunctType=peri \n\n"
)
assert (
Visualizer().render(
render_table(
fully_featured_doc_two_sentences, formats, spacing=3, start_i=11
)
== target
)
assert (
Visualizer().render(
render_table(
fully_featured_doc_two_sentences,
formats,
spacing=3,
@@ -538,7 +540,7 @@ def test_viz_rich_render_table_start(
== target
)
assert (
Visualizer().render(
render_table(
fully_featured_doc_two_sentences,
formats,
spacing=3,
@@ -549,7 +551,7 @@ def test_viz_rich_render_table_start(
== target
)
assert (
Visualizer().render(
render_table(
fully_featured_doc_two_sentences,
formats,
spacing=3,
@@ -559,7 +561,7 @@ def test_viz_rich_render_table_start(
== target
)
assert (
Visualizer().render(
render_table(
fully_featured_doc_two_sentences,
formats,
spacing=3,
@@ -571,7 +573,7 @@ def test_viz_rich_render_table_start(
== target
)
assert (
Visualizer().render(
render_table(
fully_featured_doc_two_sentences,
formats,
spacing=3,
@@ -581,7 +583,7 @@ def test_viz_rich_render_table_start(
== target
)
assert (
Visualizer().render(
render_table(
fully_featured_doc_two_sentences,
formats,
spacing=3,
@@ -591,7 +593,7 @@ def test_viz_rich_render_table_start(
== ""
)
assert (
Visualizer().render(
render_table(
fully_featured_doc_two_sentences,
formats,
spacing=3,
@@ -601,7 +603,7 @@ def test_viz_rich_render_table_start(
== ""
)
assert (
Visualizer().render(
render_table(
fully_featured_doc_two_sentences,
formats,
spacing=3,
@@ -611,7 +613,7 @@ def test_viz_rich_render_table_start(
== ""
)
assert (
Visualizer().render(
render_table(
fully_featured_doc_two_sentences,
formats,
spacing=3,
@@ -650,25 +652,25 @@ def test_viz_rich_render_table_end(
)
assert (
Visualizer().render(
render_table(
fully_featured_doc_two_sentences, formats, spacing=3, start_i=2
)
== target
)
assert (
Visualizer().render(
render_table(
fully_featured_doc_two_sentences, formats, spacing=3, start_i=2, length=3
)
== target
)
assert (
Visualizer().render(
render_table(
fully_featured_doc_two_sentences, formats, spacing=3, length=3
)
== target
)
assert (
Visualizer().render(
render_table(
fully_featured_doc_two_sentences,
formats,
spacing=3,

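Taken together, the test changes above are a mechanical migration from methods on the Visualizer class to module-level functions. As a before/after mapping (names as used in the tests; *doc* stands for a parsed Doc and *formats* for a list of AttributeFormat objects):

# before (old API): static/instance methods on Visualizer
#     Visualizer.render_dep_tree(doc[0 : len(doc)], True)
#     Visualizer().render(doc, formats, spacing=3)
# after (new API): module-level functions
#     render_dep_tree(doc[0 : len(doc)], True)
#     render_table(doc, formats, spacing=3)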
View File

@@ -38,6 +38,7 @@ from .underscore import Underscore, get_ext_args
from ._retokenize import Retokenizer
from ._serialize import ALL_ATTRS as DOCBIN_ALL_ATTRS
from ..util import get_words_and_spaces
from ..visualization import render_document
DEF PADDING = 5
@@ -1751,6 +1752,45 @@ cdef class Doc:
attrs.extend(intify_attr(x) for x in DOCBIN_ALL_ATTRS)
return tuple(attrs)
def inspect(
self,
search_attr_name=None,
search_attr_value=None,
*,
start_i=0,
length=None
):
"""Prints a tabular representation of the document or part of the document.
If part of the document is specified using any of the four optional
parameters, the sentences containing that part of the document are rendered;
if none of the four optional parameters are specified, the whole document is
rendered.
search_attr_name: the name of an attribute to search for in order to
determine where to start rendering, e.g. "lemma_",
or *None* if no search is to be carried out. If either
of *search_attr_name* and *search_attr_value* is *None*,
the behaviour is as if both were *None*.
search_attr_value: the value of an attribute to search for in order to
determine where to start rendering, e.g. "be",
or *None* if no search is to be carried out. If either
of *search_attr_name* and *search_attr_value* is *None*,
the behaviour is as if both were *None*.
start_i: the index of the token at which searching starts or,
if no search is carried out, whose sentence is the
first to be rendered. Default: 0.
length: the number of tokens to render after *start_i*;
rendering stops at the end of the sentence containing
the token *start_i + length*. If *None*, the rest of
the document is rendered.
"""
print(
render_document(
self,
search_attr_name,
search_attr_value,
start_i=start_i,
length=length
)
)
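As a usage illustration for the new Doc.inspect method (a minimal editorial sketch, not part of the diff; it assumes a pipeline with parser and NER such as en_core_web_sm is installed):

import spacy

nlp = spacy.load("en_core_web_sm")  # assumption: any parsed pipeline works
doc = nlp("Sarah's sister flew to Silicon Valley via London.")

# Render the whole document as a table
doc.inspect()

# Render only the sentence around the first token whose lemma is "fly",
# with the matching values highlighted
doc.inspect("lemma_", "fly")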
cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2:
cdef int i = token_by_char(tokens, length, start_char)

View File

@@ -4,7 +4,6 @@ from re import search
from typing import Dict, List, Optional, Union, cast
import wasabi
from wasabi.util import supports_ansi
from spacy.tokens import Span, Token, Doc
SUPPORTS_ANSI = supports_ansi()
@@ -57,7 +56,6 @@ class AttributeFormat:
max_width: Optional[int] = None,
fg_color: Optional[Union[str, int]] = None,
bg_color: Optional[Union[str, int]] = None,
permitted_vals: Optional[tuple] = None,
value_dep_fg_colors: Optional[Dict[str, Union[str, int]]] = None,
value_dep_bg_colors: Optional[Dict[str, Union[str, int]]] = None,
):
@@ -78,13 +76,17 @@ class AttributeFormat:
self.max_width = max_width
self.fg_color = fg_color
self.bg_color = bg_color
self.value_dep_fg_colors = value_dep_fg_colors
self.value_dep_bg_colors = value_dep_bg_colors
self.value_dep_fg_colors = (
value_dep_fg_colors if value_dep_fg_colors is not None else {}
)
self.value_dep_bg_colors = (
value_dep_bg_colors if value_dep_bg_colors is not None else {}
)
self.printer = wasabi.Printer(no_print=True)
def render(
self,
token: Token,
token,
*,
right_pad_to_len: Optional[int] = None,
ignore_colors: bool = False,
@@ -93,7 +95,7 @@ class AttributeFormat:
right_pad_to_len: the width to which values should be right-padded, or 'None' for no right-padding.
ignore_colors: if *True*, colors are not rendered, typically because the plain values are needed to calculate column widths
"""
value = get_token_value(token, self.attribute)
value = _get_token_value(token, self.attribute)
if self.max_width is not None:
value = value[: self.max_width]
fg_color = None
@@ -103,315 +105,363 @@
else:
right_padding = ""
if SUPPORTS_ANSI and not ignore_colors and len(value) > 0:
if self.value_dep_fg_colors is not None:
if len(self.value_dep_fg_colors) > 0:
fg_color = self.value_dep_fg_colors.get(value, None)
if self.value_dep_bg_colors is not None:
if len(self.value_dep_bg_colors) > 0:
bg_color = self.value_dep_bg_colors.get(value, None)
if fg_color is not None or bg_color is not None:
value = self.printer.text(value, color=fg_color, bg_color=bg_color)
return value + right_padding
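For orientation, a minimal sketch of using an AttributeFormat on its own (editorial, not part of the diff; the pipeline and the color number are assumptions, and only the value-dependent colors are applied by render itself):

import spacy
from spacy.visualization import AttributeFormat

nlp = spacy.load("en_core_web_sm")
token = nlp("Sarah flew to London.")[0]  # "Sarah", typically tagged PERSON

fmt = AttributeFormat("ent_type_", name="ent", value_dep_fg_colors={"PERSON": 50})
print(fmt.render(token))                      # "PERSON", color-coded where ANSI is supported
print(fmt.render(token, right_pad_to_len=8))  # the same value right-padded to width 8
print(fmt.render(token, ignore_colors=True))  # plain value, as used for width calculations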
class Visualizer:
@staticmethod
def render_dep_tree(sent: Span, root_right: bool) -> List[str]:
"""
Returns a text-based rendering of the dependency tree of a single sentence. The
rendered line for a given token has the same index within the returned list of
strings as that token has within the input sentence.
def render_dep_tree(sent, root_right: bool) -> List[str]:
"""
Returns a text-based rendering of the dependency tree of a single sentence. The
rendered line for a given token has the same index within the returned list of
strings as that token has within the input sentence.
root_right: True if the tree should be rendered with the root on the right-hand side,
False if the tree should be rendered with the root on the left-hand side.
root_right: True if the tree should be rendered with the root on the right-hand side,
False if the tree should be rendered with the root on the left-hand side.
Algorithm adapted from https://github.com/KoichiYasuoka/deplacy
"""
Algorithm adapted from https://github.com/KoichiYasuoka/deplacy
"""
# Check sent is really a sentence
if sent.start != sent[0].sent.start or sent.end != sent[0].sent.end:
raise ValueError(f"Span is not a sentence: '{sent}'")
heads: List[Optional[int]] = []
for token in sent:
if token.dep_.lower() == "root" or token.head.i == token.i:
heads.append(None)
else:
heads.append(token.head.i - sent.start)
# Check there are no head references outside the sentence
heads_outside_sent = [
1 for h in heads if h is not None and (h < 0 or h > sent.end - sent.start)
]
if len(heads_outside_sent) > 0:
raise ValueError(f"Head reference outside sentence in sentence '{sent}'")
children_lists: List[List[int]] = [[] for _ in range(sent.end - sent.start)]
for child, head in enumerate(heads):
if head is not None:
children_lists[head].append(child)
all_ind_ord_by_col: List[int] = []
# start with the root column
inds_in_this_col = [i for i, h in enumerate(heads) if h is None]
while len(inds_in_this_col) > 0:
all_ind_ord_by_col = inds_in_this_col + all_ind_ord_by_col
inds_in_next_col = []
# The calculation order of the horizontal lengths of the children
# on either side of a head must ensure that children
# closer to the head are processed first.
for ind_in_this_col in inds_in_this_col:
following_child_inds = [
i for i in children_lists[ind_in_this_col] if i > ind_in_this_col
]
inds_in_next_col.extend(following_child_inds)
preceding_child_inds = [
i for i in children_lists[ind_in_this_col] if i < ind_in_this_col
]
preceding_child_inds.reverse()
inds_in_next_col.extend(preceding_child_inds)
inds_in_this_col = inds_in_next_col
horiz_line_lens: List[int] = []
for i in range(sent.end - sent.start):
if heads[i] is None:
horiz_line_lens.append(-1)
elif len(children_lists[i]) == 0 and abs(cast(int, heads[i]) - i) == 1:
# governed by direct neighbour and has no children itself
horiz_line_lens.append(1)
else:
horiz_line_lens.append(0)
while 0 in horiz_line_lens:
for working_token_ind in (
i for i in all_ind_ord_by_col if horiz_line_lens[i] == 0
):
# render relation between this token and its head
first_ind_in_rel = min(
working_token_ind,
cast(int, heads[working_token_ind]),
)
second_ind_in_rel = max(
working_token_ind,
cast(int, heads[working_token_ind]),
)
# If this token has children, they will already have been rendered.
# The line needs to be one character longer than the longest of the
# children's lines.
if len(children_lists[working_token_ind]) > 0:
horiz_line_lens[working_token_ind] = (
max(
[
horiz_line_lens[i]
for i in children_lists[working_token_ind]
]
)
+ 1
)
else:
horiz_line_lens[working_token_ind] = 1
for inbetween_ind in (
i
for i in range(first_ind_in_rel + 1, second_ind_in_rel)
if horiz_line_lens[i] != 0
):
alt_ind: int
if (
inbetween_ind
in children_lists[cast(int, heads[working_token_ind])]
and inbetween_ind not in children_lists[working_token_ind]
):
alt_ind = horiz_line_lens[inbetween_ind]
else:
alt_ind = horiz_line_lens[inbetween_ind] + 1
if alt_ind > horiz_line_lens[working_token_ind]:
horiz_line_lens[working_token_ind] = alt_ind
max_horiz_line_len = max(horiz_line_lens)
char_matrix = [
[SPACE] * max_horiz_line_len * 2 for _ in range(sent.start, sent.end)
]
for working_token_ind in range(sent.end - sent.start):
head_token_ind = heads[working_token_ind]
if head_token_ind is None:
continue
first_ind_in_rel = min(working_token_ind, head_token_ind)
second_ind_in_rel = max(working_token_ind, head_token_ind)
char_horiz_line_len = 2 * horiz_line_lens[working_token_ind]
# Draw the corners of the relation
char_matrix[first_ind_in_rel][char_horiz_line_len - 1] |= (
HALF_HORIZONTAL_LINE + LOWER_HALF_VERTICAL_LINE
)
char_matrix[second_ind_in_rel][char_horiz_line_len - 1] |= (
HALF_HORIZONTAL_LINE + UPPER_HALF_VERTICAL_LINE
)
# Draw the horizontal line for the governing token
for working_horiz_pos in range(char_horiz_line_len - 1):
if char_matrix[head_token_ind][working_horiz_pos] != FULL_VERTICAL_LINE:
char_matrix[head_token_ind][
working_horiz_pos
] |= FULL_HORIZONTAL_LINE
# Draw the vertical line for the relation
for working_vert_pos in range(first_ind_in_rel + 1, second_ind_in_rel):
if (
char_matrix[working_vert_pos][char_horiz_line_len - 1]
!= FULL_HORIZONTAL_LINE
):
char_matrix[working_vert_pos][
char_horiz_line_len - 1
] |= FULL_VERTICAL_LINE
# Check sent is really a sentence
if sent.start != sent[0].sent.start or sent.end != sent[0].sent.end:
raise ValueError(f"Span is not a sentence: '{sent}'")
heads: List[Optional[int]] = []
for token in sent:
if token.dep_.lower() == "root" or token.head.i == token.i:
heads.append(None)
else:
heads.append(token.head.i - sent.start)
# Check there are no head references outside the sentence
heads_outside_sent = [
1 for h in heads if h is not None and (h < 0 or h > sent.end - sent.start)
]
if len(heads_outside_sent) > 0:
raise ValueError(f"Head reference outside sentence in sentence '{sent}'")
children_lists: List[List[int]] = [[] for _ in range(sent.end - sent.start)]
for child, head in enumerate(heads):
if head is not None:
children_lists[head].append(child)
all_ind_ord_by_col: List[int] = []
# start with the root column
inds_in_this_col = [i for i, h in enumerate(heads) if h is None]
while len(inds_in_this_col) > 0:
all_ind_ord_by_col = inds_in_this_col + all_ind_ord_by_col
inds_in_next_col = []
# The calculation order of the horizontal lengths of the children
# on either side of a head must ensure that children
# closer to the head are processed first.
for ind_in_this_col in inds_in_this_col:
following_child_inds = [
i for i in children_lists[ind_in_this_col] if i > ind_in_this_col
]
inds_in_next_col.extend(following_child_inds)
preceding_child_inds = [
i for i in children_lists[ind_in_this_col] if i < ind_in_this_col
]
preceding_child_inds.reverse()
inds_in_next_col.extend(preceding_child_inds)
inds_in_this_col = inds_in_next_col
horiz_line_lens: List[int] = []
for i in range(sent.end - sent.start):
if heads[i] is None:
horiz_line_lens.append(-1)
elif len(children_lists[i]) == 0 and abs(cast(int, heads[i]) - i) == 1:
# governed by direct neighbour and has no children itself
horiz_line_lens.append(1)
else:
horiz_line_lens.append(0)
while 0 in horiz_line_lens:
for working_token_ind in (
i for i in range(sent.end - sent.start) if heads[i] is not None
i for i in all_ind_ord_by_col if horiz_line_lens[i] == 0
):
for working_horiz_pos in range(
2 * horiz_line_lens[working_token_ind] - 2, -1, -1
):
if (
(
char_matrix[working_token_ind][working_horiz_pos]
== FULL_VERTICAL_LINE
)
and working_horiz_pos > 1
and char_matrix[working_token_ind][working_horiz_pos - 2] == SPACE
):
# Cross over the existing vertical line, which is due to a non-projective tree
continue
if char_matrix[working_token_ind][working_horiz_pos] != SPACE:
# Draw the arrowhead to the right of what is already there
char_matrix[working_token_ind][working_horiz_pos + 1] = ARROWHEAD
break
if working_horiz_pos == 0:
# Draw the arrowhead at the boundary of the diagram
char_matrix[working_token_ind][working_horiz_pos] = ARROWHEAD
else:
# Fill in the horizontal line for the governed token
char_matrix[working_token_ind][
working_horiz_pos
] |= FULL_HORIZONTAL_LINE
if root_right:
return [
"".join(
ROOT_RIGHT_CHARS[char_matrix[vert_pos][horiz_pos]]
for horiz_pos in range((max_horiz_line_len * 2))
)
for vert_pos in range(sent.end - sent.start)
]
else:
return [
"".join(
ROOT_LEFT_CHARS[char_matrix[vert_pos][horiz_pos]]
for horiz_pos in range((max_horiz_line_len * 2))
)[::-1]
for vert_pos in range(sent.end - sent.start)
]
def render(
self,
doc: Doc,
cols: List[AttributeFormat],
spacing: int = 2,
start_i: int = 0,
length: Optional[int] = None,
search_attr_name: Optional[str] = None,
search_attr_value: Optional[str] = None,
) -> str:
"""Renders a document as a table.
TODO: specify a specific portion of the document to display.
cols: the attribute formats of the columns to display.
tree_right and tree_left are magic values for the
attributes that render dependency trees where the
roots are on the right or left respectively.
spacing: the number of spaces between each column in the table.
start_i: the index of the token at which searching starts or,
if no search is carried out, whose sentence is the
first to be rendered. Default: 0.
length: the number of tokens to render after *start_i*;
rendering stops at the end of the sentence containing
the token *start_i + length*. If *None*, the rest of
the document is rendered.
search_attr_name: the name of an attribute to search for in order to
determine where to start rendering, e.g. "lemma_",
or *None* if no search is to be carried out. If either
of *search_attr_name* and *search_attr_value* is *None*,
the behaviour is as if both were *None*.
search_attr_value: the value of an attribute to search for in order to
determine where to start rendering, e.g. "be",
or *None* if no search is to be carried out. If either
of *search_attr_name* and *search_attr_value* is *None*,
the behaviour is as if both were *None*.
"""
return_str = ""
if search_attr_name is not None and search_attr_value is not None:
adj_start_i = get_adjusted_start_i(
doc, start_i, cols, search_attr_name, search_attr_value
# render relation between this token and its head
first_ind_in_rel = min(
working_token_ind,
cast(int, heads[working_token_ind]),
)
else:
adj_start_i = start_i
if adj_start_i >= len(doc):
return return_str
end_i = len(doc) - 1
if length is not None:
end_i = min(end_i, adj_start_i + length)
elif start_i > 0 or (
search_attr_name is not None and search_attr_value is not None
):
end_i = adj_start_i
adj_start_i = doc[adj_start_i].sent.start
end_i = doc[end_i].sent.end
for sent in doc[adj_start_i:end_i].sents:
if "tree_right" in (c.attribute for c in cols):
tree_right = self.render_dep_tree(sent, True)
if "tree_left" in (c.attribute for c in cols):
tree_left = self.render_dep_tree(sent, False)
widths = []
for col in cols:
# get the values without any color codes
if col.attribute == "tree_left":
width = len(tree_left[0]) # type: ignore
elif col.attribute == "tree_right":
width = len(tree_right[0]) # type: ignore
else:
if len(sent) > 0:
width = max(
len(col.render(token, ignore_colors=True)) for token in sent
)
else:
width = 0
if col.max_width is not None:
width = min(width, col.max_width)
width = max(width, len(col.name))
widths.append(width)
data: List[List[str]] = []
for token_index, token in enumerate(sent):
inner_data: List[str] = []
for col_index, col in enumerate(cols):
if col.attribute == "tree_right":
inner_data.append(tree_right[token_index])
elif col.attribute == "tree_left":
inner_data.append(tree_left[token_index])
else:
inner_data.append(
col.render(token, right_pad_to_len=widths[col_index])
)
data.append(inner_data)
header: Optional[List[str]]
if len([1 for c in cols if len(c.name) > 0]) > 0:
header = [c.name for c in cols]
second_ind_in_rel = max(
working_token_ind,
cast(int, heads[working_token_ind]),
)
# If this token has children, they will already have been rendered.
# The line needs to be one character longer than the longest of the
# children's lines.
if len(children_lists[working_token_ind]) > 0:
horiz_line_lens[working_token_ind] = (
max([horiz_line_lens[i] for i in children_lists[working_token_ind]])
+ 1
)
else:
header = None
aligns = [c.aligns for c in cols]
fg_colors = [c.fg_color for c in cols]
bg_colors = [c.bg_color for c in cols]
return_str += (
wasabi.table(
data,
header=header,
divider=True,
aligns=aligns,
widths=widths,
fg_colors=fg_colors,
bg_colors=bg_colors,
spacing=spacing,
horiz_line_lens[working_token_ind] = 1
for inbetween_ind in (
i
for i in range(first_ind_in_rel + 1, second_ind_in_rel)
if horiz_line_lens[i] != 0
):
alt_ind: int
if (
inbetween_ind in children_lists[cast(int, heads[working_token_ind])]
and inbetween_ind not in children_lists[working_token_ind]
):
alt_ind = horiz_line_lens[inbetween_ind]
else:
alt_ind = horiz_line_lens[inbetween_ind] + 1
if alt_ind > horiz_line_lens[working_token_ind]:
horiz_line_lens[working_token_ind] = alt_ind
max_horiz_line_len = max(horiz_line_lens)
char_matrix = [
[SPACE] * max_horiz_line_len * 2 for _ in range(sent.start, sent.end)
]
for working_token_ind in range(sent.end - sent.start):
head_token_ind = heads[working_token_ind]
if head_token_ind is None:
continue
first_ind_in_rel = min(working_token_ind, head_token_ind)
second_ind_in_rel = max(working_token_ind, head_token_ind)
char_horiz_line_len = 2 * horiz_line_lens[working_token_ind]
# Draw the corners of the relation
char_matrix[first_ind_in_rel][char_horiz_line_len - 1] |= (
HALF_HORIZONTAL_LINE + LOWER_HALF_VERTICAL_LINE
)
char_matrix[second_ind_in_rel][char_horiz_line_len - 1] |= (
HALF_HORIZONTAL_LINE + UPPER_HALF_VERTICAL_LINE
)
# Draw the horizontal line for the governing token
for working_horiz_pos in range(char_horiz_line_len - 1):
if char_matrix[head_token_ind][working_horiz_pos] != FULL_VERTICAL_LINE:
char_matrix[head_token_ind][working_horiz_pos] |= FULL_HORIZONTAL_LINE
# Draw the vertical line for the relation
for working_vert_pos in range(first_ind_in_rel + 1, second_ind_in_rel):
if (
char_matrix[working_vert_pos][char_horiz_line_len - 1]
!= FULL_HORIZONTAL_LINE
):
char_matrix[working_vert_pos][
char_horiz_line_len - 1
] |= FULL_VERTICAL_LINE
for working_token_ind in (
i for i in range(sent.end - sent.start) if heads[i] is not None
):
for working_horiz_pos in range(
2 * horiz_line_lens[working_token_ind] - 2, -1, -1
):
if (
(
char_matrix[working_token_ind][working_horiz_pos]
== FULL_VERTICAL_LINE
)
+ "\n"
and working_horiz_pos > 1
and char_matrix[working_token_ind][working_horiz_pos - 2] == SPACE
):
# Cross over the existing vertical line, which is due to a non-projective tree
continue
if char_matrix[working_token_ind][working_horiz_pos] != SPACE:
# Draw the arrowhead to the right of what is already there
char_matrix[working_token_ind][working_horiz_pos + 1] = ARROWHEAD
break
if working_horiz_pos == 0:
# Draw the arrowhead at the boundary of the diagram
char_matrix[working_token_ind][working_horiz_pos] = ARROWHEAD
else:
# Fill in the horizontal line for the governed token
char_matrix[working_token_ind][
working_horiz_pos
] |= FULL_HORIZONTAL_LINE
if root_right:
return [
"".join(
ROOT_RIGHT_CHARS[char_matrix[vert_pos][horiz_pos]]
for horiz_pos in range((max_horiz_line_len * 2))
)
for vert_pos in range(sent.end - sent.start)
]
else:
return [
"".join(
ROOT_LEFT_CHARS[char_matrix[vert_pos][horiz_pos]]
for horiz_pos in range((max_horiz_line_len * 2))
)[::-1]
for vert_pos in range(sent.end - sent.start)
]
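A minimal sketch of calling the new module-level function directly (editorial; assumes a parsed pipeline, and the argument must be a Span covering a complete sentence):

import spacy
from spacy.visualization import render_dep_tree

nlp = spacy.load("en_core_web_sm")
doc = nlp("She loved it.")
sent = doc[0 : len(doc)]  # a full-sentence Span; anything else raises ValueError

# One line of tree output per token, with the root on the right
for tree_line, token in zip(render_dep_tree(sent, True), sent):
    print(tree_line, token.text)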
def render_table(
doc,
cols: List[AttributeFormat],
spacing: int = 3,
search_attr_name: Optional[str] = None,
search_attr_value: Optional[str] = None,
start_i: int = 0,
length: Optional[int] = None,
) -> str:
"""Renders a document as a table, allowing the caller to specify various
display options.
doc: the document.
cols: the attribute formats of the columns to display.
tree_right and tree_left are magic values for the
attributes that render dependency trees where the
roots are on the right or left respectively.
spacing: the number of spaces between each column in the table.
search_attr_name: the name of an attribute to search for in order to
determine where to start rendering, e.g. "lemma_",
or *None* if no search is to be carried out. If either
of *search_attr_name* and *search_attr_value* is *None*,
the behaviour is as if both were *None*.
search_attr_value: the value of an attribute to search for in order to
determine where to start rendering, e.g. "be",
or *None* if no search is to be carried out. If either
of *search_attr_name* and *search_attr_value* is *None*,
the behaviour is as if both were *None*.
start_i: the index of the token at which searching starts or,
if no search is carried out, whose sentence is the
first to be rendered. Default: 0.
length: the number of tokens to render after *start_i*;
rendering stops at the end of the sentence containing
the token *start_i + length*. If *None*, the rest of
the document is rendered.
"""
return_str = ""
if (
search_attr_name is not None
and search_attr_name not in ("tree_right", "tree_left")
and search_attr_value is not None
):
adj_start_i = _get_adjusted_start_i(
doc, start_i, cols, search_attr_name, search_attr_value
)
else:
adj_start_i = start_i
if adj_start_i >= len(doc):
return return_str
end_i = len(doc) - 1
if length is not None:
end_i = min(end_i, adj_start_i + length)
elif start_i > 0 or (
search_attr_name is not None and search_attr_value is not None
):
end_i = adj_start_i
adj_start_i = doc[adj_start_i].sent.start
end_i = doc[end_i].sent.end
for sent in doc[adj_start_i:end_i].sents:
if "tree_right" in (c.attribute for c in cols):
tree_right = render_dep_tree(sent, True)
if "tree_left" in (c.attribute for c in cols):
tree_left = render_dep_tree(sent, False)
widths = []
for col in cols:
# get the values without any color codes
if col.attribute == "tree_left":
width = len(tree_left[0]) # type: ignore
elif col.attribute == "tree_right":
width = len(tree_right[0]) # type: ignore
else:
if len(sent) > 0:
width = max(
len(col.render(token, ignore_colors=True)) for token in sent
)
else:
width = 0
if col.max_width is not None:
width = min(width, col.max_width)
width = max(width, len(col.name))
widths.append(width)
data: List[List[str]] = []
for token_index, token in enumerate(sent):
inner_data: List[str] = []
for col_index, col in enumerate(cols):
if col.attribute == "tree_right":
inner_data.append(tree_right[token_index])
elif col.attribute == "tree_left":
inner_data.append(tree_left[token_index])
else:
inner_data.append(
col.render(token, right_pad_to_len=widths[col_index])
)
data.append(inner_data)
header: Optional[List[str]]
if len([1 for c in cols if len(c.name) > 0]) > 0:
header = [c.name for c in cols]
else:
header = None
aligns = [c.aligns for c in cols]
fg_colors = [c.fg_color for c in cols]
bg_colors = [c.bg_color for c in cols]
return_str += (
wasabi.table(
data,
header=header,
divider=True,
aligns=aligns,
widths=widths,
fg_colors=fg_colors,
bg_colors=bg_colors,
spacing=spacing,
)
+ "\n"
)
return return_str
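A sketch of a direct render_table call (editorial; the column selection is illustrative, and tree_left/tree_right are the magic attribute values described in the docstring above):

import spacy
from spacy.visualization import AttributeFormat, render_table

nlp = spacy.load("en_core_web_sm")
doc = nlp("Sarah's sister flew to Silicon Valley via London.")
cols = [
    AttributeFormat("tree_left", name="tree", aligns="r", fg_color=2),
    AttributeFormat("dep_", name="dep"),
    AttributeFormat("i", name="index", aligns="r"),
    AttributeFormat("text", name="text"),
    AttributeFormat("ent_type_", name="ent"),
]
print(render_table(doc, cols, spacing=3))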
def get_token_value(token: Token, attribute: str) -> str:
def render_document(
doc,
search_attr_name: Optional[str] = None,
search_attr_value: Optional[str] = None,
*,
start_i: int = 0,
length: Optional[int] = None,
) -> str:
"""Renders a document as a table using standard display options.
doc: the document.
search_attr_name: the name of an attribute to search for in order to
determine where to start rendering, e.g. "lemma_",
or *None* if no search is to be carried out. If either
of *search_attr_name* and *search_attr_value* is *None*,
the behaviour is as if both were *None*.
search_attr_value: the value of an attribute to search for in order to
determine where to start rendering, e.g. "be",
or *None* if no search is to be carried out. If either
of *search_attr_name* and *search_attr_value* is *None*,
the behaviour is as if both were *None*.
start_i: the index of the token at which searching starts or,
if no search is carried out, whose sentence is the
first to be rendered. Default: 0.
length: the number of tokens to render after *start_i*;
rendering stops at the end of the sentence containing
the token *start_i + length*. If *None*, the rest of
the document is rendered.
"""
cols = [
AttributeFormat("tree_left", name="tree", aligns="r", fg_color=4),
AttributeFormat("dep_", name="dep_"),
AttributeFormat("ent_type_", name="ent_type_"),
AttributeFormat("i", name="index", aligns="r"),
AttributeFormat("text", name="text", max_width=20),
AttributeFormat("lemma_", name="lemma_", max_width=20),
AttributeFormat("pos_", name="pos_"),
AttributeFormat("tag_", name="tag_"),
AttributeFormat("morph", name="morph_", max_width=60),
]
if search_attr_name is not None and search_attr_value is not None:
for col in cols:
if col.attribute == search_attr_name or col.name == search_attr_name:
col.value_dep_fg_colors[search_attr_value] = 1
return render_table(
doc=doc,
cols=cols,
spacing=3,
search_attr_name=search_attr_name,
search_attr_value=search_attr_value,
start_i=start_i,
length=length,
)
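And the corresponding convenience wrapper, which is what Doc.inspect prints (editorial; continues the sketch above, reusing *nlp* and *doc*):

from spacy.visualization import render_document

print(render_document(doc))                   # the whole document with standard columns
print(render_document(doc, "lemma_", "fly"))  # the sentence containing the first match,
                                              # with matching values highlighted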
def _get_token_value(token, attribute: str) -> str:
"""
Gets the value of *token.<attribute>*, where *attribute* may be a dotted path such as "head.lemma_".
@@ -422,11 +472,11 @@ def get_token_value(token: Token, attribute: str) -> str:
parts = attribute.split(".")
for part in parts[:-1]:
obj = getattr(obj, part)
return str(getattr(obj, parts[-1]))
return str(getattr(obj, parts[-1])).strip()
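The dotted-path behaviour in practice (editorial; *token* stands for any token from a parsed Doc):

_get_token_value(token, "lemma_")       # equivalent to str(token.lemma_).strip()
_get_token_value(token, "head.lemma_")  # follows the dotted path: str(token.head.lemma_).strip()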
def get_adjusted_start_i(
doc: Doc,
def _get_adjusted_start_i(
doc,
start_i: int,
cols: List[AttributeFormat],
search_attr_name: str,
@@ -447,7 +497,7 @@ def get_adjusted_start_i(
for col in cols:
if col.name == search_attr_name or col.attribute == search_attr_name:
for token in doc[start_i:]:
if get_token_value(token, col.attribute) == search_attr_value:
if _get_token_value(token, col.attribute) == search_attr_value:
return token.i
else:
return len(doc)