diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 88c7adfe3..0cd2727b7 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -1,5 +1,7 @@ import pytest from spacy.util import get_lang_class +from spacy.lang.en import English +from spacy.tokens import Doc def pytest_addoption(parser): @@ -390,3 +392,239 @@ def zh_tokenizer_pkuseg(): @pytest.fixture(scope="session") def hy_tokenizer(): return get_lang_class("hy")().tokenizer + + +@pytest.fixture +def tagged_doc(): + text = "Sarah's sister flew to Silicon Valley via London." + tags = ["NNP", "POS", "NN", "VBD", "IN", "NNP", "NNP", "IN", "NNP", "."] + pos = [ + "PROPN", + "PART", + "NOUN", + "VERB", + "ADP", + "PROPN", + "PROPN", + "ADP", + "PROPN", + "PUNCT", + ] + morphs = [ + "NounType=prop|Number=sing", + "Poss=yes", + "Number=sing", + "Tense=past|VerbForm=fin", + "", + "NounType=prop|Number=sing", + "NounType=prop|Number=sing", + "", + "NounType=prop|Number=sing", + "PunctType=peri", + ] + nlp = English() + doc = nlp(text) + for i in range(len(tags)): + doc[i].tag_ = tags[i] + doc[i].pos_ = pos[i] + doc[i].set_morph(morphs[i]) + if i > 0: + doc[i].is_sent_start = False + return doc + + +@pytest.fixture +def fully_featured_doc_one_sentence(en_vocab): + words = [ + "Sarah", + "'s", + "sister", + "flew", + "to", + "Silicon", + "Valley", + "via", + "London", + ".", + ] + lemmas = [ + "sarah", + "'s", + "sister", + "fly", + "to", + "silicon", + "valley", + "via", + "london", + ".", + ] + spaces = [False, True, True, True, True, True, True, True, False, False] + tags = ["NNP", "POS", "NN", "VBD", "IN", "NNP", "NNP", "IN", "NNP", "."] + pos = [ + "PROPN", + "PART", + "NOUN", + "VERB", + "ADP", + "PROPN", + "PROPN", + "ADP", + "PROPN", + "PUNCT", + ] + morphs = [ + "NounType=prop|Number=sing", + "Poss=yes", + "Number=sing", + "Tense=past|VerbForm=fin", + "", + "NounType=prop|Number=sing", + "NounType=prop|Number=sing", + "", + "NounType=prop|Number=sing", + "PunctType=peri", + ] + heads = [2, 0, 3, 3, 3, 6, 4, 3, 7, 3] + deps = [ + "poss", + "case", + "nsubj", + "ROOT", + "prep", + "compound", + "pobj", + "prep", + "pobj", + "punct", + ] + ent_types = ["PERSON", "", "", "", "", "GPE", "GPE", "", "GPE", ""] + doc = Doc( + en_vocab, + words=words, + lemmas=lemmas, + spaces=spaces, + heads=heads, + deps=deps, + morphs=morphs, + ) + for i in range(len(tags)): + doc[i].tag_ = tags[i] + doc[i].pos_ = pos[i] + doc[i].ent_type_ = ent_types[i] + return doc + + +@pytest.fixture +def fully_featured_doc_two_sentences(en_vocab): + words = [ + "Sarah", + "'s", + "sister", + "flew", + "to", + "Silicon", + "Valley", + "via", + "London", + ".", + "She", + "loved", + "it", + "." + ] + lemmas = [ + "sarah", + "'s", + "sister", + "fly", + "to", + "silicon", + "valley", + "via", + "london", + ".", + "she", + "love", + "it", + "." 
+    ]
+    spaces = [False, True, True, True, True, True, True, True, False, True, True, True, False, False]
+    pos = [
+        "PROPN",
+        "PART",
+        "NOUN",
+        "VERB",
+        "ADP",
+        "PROPN",
+        "PROPN",
+        "ADP",
+        "PROPN",
+        "PUNCT",
+        "PRON",
+        "VERB",
+        "PRON",
+        "PUNCT",
+    ]
+    tags = ["NNP", "POS", "NN", "VBD", "IN", "NNP", "NNP", "IN", "NNP", ".", "PRP", "VBD", "PRP", "."]
+    morphs = [
+        "NounType=prop|Number=sing",
+        "Poss=yes",
+        "Number=sing",
+        "Tense=past|VerbForm=fin",
+        "",
+        "NounType=prop|Number=sing",
+        "NounType=prop|Number=sing",
+        "",
+        "NounType=prop|Number=sing",
+        "PunctType=peri",
+        "Case=Nom|Gender=Fem|Number=Sing|Person=3|PronType=Prs",
+        "Tense=Past|VerbForm=Fin",
+        "Case=Acc|Gender=Neut|Number=Sing|Person=3|PronType=Prs",
+        "PunctType=peri",
+    ]
+    heads = [2, 0, 3, 3, 3, 6, 4, 3, 7, 3, 11, 11, 11, 11]
+    deps = [
+        "poss",
+        "case",
+        "nsubj",
+        "ROOT",
+        "prep",
+        "compound",
+        "pobj",
+        "prep",
+        "pobj",
+        "punct",
+        "nsubj",
+        "ROOT",
+        "dobj",
+        "punct",
+    ]
+    ent_types = ["PERSON", "", "", "", "", "GPE", "GPE", "", "GPE", "", "", "", "", ""]
+    doc = Doc(
+        en_vocab,
+        words=words,
+        lemmas=lemmas,
+        spaces=spaces,
+        heads=heads,
+        deps=deps,
+        morphs=morphs,
+    )
+    for i in range(len(tags)):
+        doc[i].tag_ = tags[i]
+        doc[i].pos_ = pos[i]
+        doc[i].ent_type_ = ent_types[i]
+    return doc
+
+
+@pytest.fixture
+def sented_doc():
+    text = "One sentence. Two sentences. Three sentences."
+    nlp = English()
+    doc = nlp(text)
+    for i in range(len(doc)):
+        if i % 3 == 0:
+            doc[i].is_sent_start = True
+        else:
+            doc[i].is_sent_start = False
+    return doc
diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py
index 6e15fa2de..cdc706ebd 100644
--- a/spacy/tests/test_scorer.py
+++ b/spacy/tests/test_scorer.py
@@ -43,58 +43,6 @@ test_ner_apple = [
 ]
 
 
-@pytest.fixture
-def tagged_doc():
-    text = "Sarah's sister flew to Silicon Valley via London."
-    tags = ["NNP", "POS", "NN", "VBD", "IN", "NNP", "NNP", "IN", "NNP", "."]
-    pos = [
-        "PROPN",
-        "PART",
-        "NOUN",
-        "VERB",
-        "ADP",
-        "PROPN",
-        "PROPN",
-        "ADP",
-        "PROPN",
-        "PUNCT",
-    ]
-    morphs = [
-        "NounType=prop|Number=sing",
-        "Poss=yes",
-        "Number=sing",
-        "Tense=past|VerbForm=fin",
-        "",
-        "NounType=prop|Number=sing",
-        "NounType=prop|Number=sing",
-        "",
-        "NounType=prop|Number=sing",
-        "PunctType=peri",
-    ]
-    nlp = English()
-    doc = nlp(text)
-    for i in range(len(tags)):
-        doc[i].tag_ = tags[i]
-        doc[i].pos_ = pos[i]
-        doc[i].set_morph(morphs[i])
-        if i > 0:
-            doc[i].is_sent_start = False
-    return doc
-
-
-@pytest.fixture
-def sented_doc():
-    text = "One sentence. Two sentences. Three sentences."
- nlp = English() - doc = nlp(text) - for i in range(len(doc)): - if i % 3 == 0: - doc[i].is_sent_start = True - else: - doc[i].is_sent_start = False - return doc - - def test_tokenization(sented_doc): scorer = Scorer() gold = {"sent_starts": [t.sent_start for t in sented_doc]} diff --git a/spacy/tests/test_visualization.py b/spacy/tests/test_visualization.py index 32c5eb74a..af68f7e03 100644 --- a/spacy/tests/test_visualization.py +++ b/spacy/tests/test_visualization.py @@ -1,10 +1,14 @@ import pytest import deplacy -from spacy.visualization import Visualizer -from spacy.tokens import Span, Doc +from wasabi.util import supports_ansi +from spacy.visualization import AttributeFormat, Visualizer +from spacy.tokens import Span, Doc, Token -def test_dependency_tree_basic(en_vocab): +SUPPORTS_ANSI = supports_ansi() + + +def test_visualization_dependency_tree_basic(en_vocab): """Test basic dependency tree display.""" doc = Doc( en_vocab, @@ -48,7 +52,7 @@ def test_dependency_tree_basic(en_vocab): ] -def test_dependency_tree_non_initial_sentence(en_vocab): +def test_visualization_dependency_tree_non_initial_sentence(en_vocab): """Test basic dependency tree display.""" doc = Doc( en_vocab, @@ -95,8 +99,8 @@ def test_dependency_tree_non_initial_sentence(en_vocab): ] -def test_dependency_tree_non_projective(en_vocab): - """Test dependency tree display with a non-prejective dependency.""" +def test_visualization_dependency_tree_non_projective(en_vocab): + """Test dependency tree display with a non-projective dependency.""" doc = Doc( en_vocab, words=[ @@ -114,8 +118,6 @@ def test_dependency_tree_non_projective(en_vocab): deps=["dep"] * 9, ) dep_tree = Visualizer.render_dependency_tree(doc[0 : len(doc)], True) - for line in dep_tree: - print(line) assert dep_tree == [ "<╗ ", "═╩═══╗", @@ -141,7 +143,7 @@ def test_dependency_tree_non_projective(en_vocab): ] -def test_dependency_tree_input_not_span(en_vocab): +def test_visualization_dependency_tree_input_not_span(en_vocab): """Test dependency tree display behaviour when the input is not a Span.""" doc = Doc( en_vocab, @@ -163,7 +165,8 @@ def test_dependency_tree_input_not_span(en_vocab): with pytest.raises(AssertionError): Visualizer.render_dependency_tree(doc[1:3], True) -def test_dependency_tree_highly_nonprojective(en_vocab): + +def test_visualization_dependency_tree_highly_nonprojective(en_vocab): """Test a highly non-projective tree (colloquial Polish).""" doc = Doc( en_vocab, @@ -204,3 +207,337 @@ def test_dependency_tree_highly_nonprojective(en_vocab): ] +def test_visualization_get_entity_native_attribute_int(en_vocab): + doc = Doc( + en_vocab, + words=[ + "I", + "saw", + "a", + "horse", + "yesterday", + "that", + "was", + "injured", + ".", + ], + heads=[1, None, 3, 1, 1, 7, 7, 3, 1], + deps=["dep"] * 9, + ) + assert Visualizer().get_entity(doc[2], "head.i") == "3" + + +def test_visualization_get_entity_native_attribute_str(en_vocab): + doc = Doc( + en_vocab, + words=[ + "I", + "saw", + "a", + "horse", + "yesterday", + "that", + "was", + "injured", + ".", + ], + heads=[1, None, 3, 1, 1, 7, 7, 3, 1], + deps=["dep"] * 9, + ) + + assert Visualizer().get_entity(doc[2], "dep_") == "dep" + + +def test_visualization_get_entity_colors(en_vocab): + doc = Doc( + en_vocab, + words=[ + "I", + "saw", + "a", + "horse", + "yesterday", + "that", + "was", + "injured", + ".", + ], + heads=[1, None, 3, 1, 1, 7, 7, 3, 1], + deps=["dep"] * 9, + ) + + assert ( + Visualizer().get_entity( + doc[2], + "dep_", + value_dependent_fg_colors={"dep": 2}, + 
            value_dependent_bg_colors={"dep": 11},
+        )
+        == ("\x1b[38;5;2;48;5;11mdep\x1b[0m"
+            if SUPPORTS_ANSI
+            else "dep")
+    )
+
+
+def test_visualization_get_entity_colors_only_fg(en_vocab):
+    doc = Doc(
+        en_vocab,
+        words=[
+            "I",
+            "saw",
+            "a",
+            "horse",
+            "yesterday",
+            "that",
+            "was",
+            "injured",
+            ".",
+        ],
+        heads=[1, None, 3, 1, 1, 7, 7, 3, 1],
+        deps=["dep"] * 9,
+    )
+
+    assert (
+        Visualizer().get_entity(doc[2], "dep_", value_dependent_fg_colors={"dep": 2})
+        == ("\x1b[38;5;2mdep\x1b[0m"
+            if SUPPORTS_ANSI
+            else "dep")
+    )
+
+
+def test_visualization_get_entity_colors_only_bg(en_vocab):
+    doc = Doc(
+        en_vocab,
+        words=[
+            "I",
+            "saw",
+            "a",
+            "horse",
+            "yesterday",
+            "that",
+            "was",
+            "injured",
+            ".",
+        ],
+        heads=[1, None, 3, 1, 1, 7, 7, 3, 1],
+        deps=["dep"] * 9,
+    )
+
+    assert (
+        Visualizer().get_entity(doc[2], "dep_", value_dependent_bg_colors={"dep": 11})
+        == ("\x1b[48;5;11mdep\x1b[0m"
+            if SUPPORTS_ANSI
+            else "dep")
+    )
+
+
+def test_visualization_get_entity_native_attribute_missing(en_vocab):
+    doc = Doc(
+        en_vocab,
+        words=[
+            "I",
+            "saw",
+            "a",
+            "horse",
+            "yesterday",
+            "that",
+            "was",
+            "injured",
+            ".",
+        ],
+        heads=[1, None, 3, 1, 1, 7, 7, 3, 1],
+        deps=["dep"] * 9,
+    )
+    with pytest.raises(AttributeError):
+        Visualizer().get_entity(doc[2], "depp")
+
+
+def test_visualization_get_entity_custom_attribute_str(en_vocab):
+    doc = Doc(
+        en_vocab,
+        words=[
+            "I",
+            "saw",
+            "a",
+            "horse",
+            "yesterday",
+            "that",
+            "was",
+            "injured",
+            ".",
+        ],
+        heads=[1, None, 3, 1, 1, 7, 7, 3, 1],
+        deps=["dep"] * 9,
+    )
+    Token.set_extension("test", default="tested", force=True)
+    assert Visualizer().get_entity(doc[2], "_.test") == "tested"
+
+
+def test_visualization_get_entity_nested_custom_attribute_str(en_vocab):
+    doc = Doc(
+        en_vocab,
+        words=[
+            "I",
+            "saw",
+            "a",
+            "horse",
+            "yesterday",
+            "that",
+            "was",
+            "injured",
+            ".",
+        ],
+        heads=[1, None, 3, 1, 1, 7, 7, 3, 1],
+        deps=["dep"] * 9,
+    )
+
+    class Test:
+        def __init__(self):
+            self.inner_test = "tested"
+
+    Token.set_extension("test", default=Test(), force=True)
+    assert Visualizer().get_entity(doc[2], "_.test.inner_test") == "tested"
+
+
+def test_visualization_get_entity_custom_attribute_missing(en_vocab):
+    doc = Doc(
+        en_vocab,
+        words=[
+            "I",
+            "saw",
+            "a",
+            "horse",
+            "yesterday",
+            "that",
+            "was",
+            "injured",
+            ".",
+        ],
+        heads=[1, None, 3, 1, 1, 7, 7, 3, 1],
+        deps=["dep"] * 9,
+    )
+    with pytest.raises(AttributeError):
+        Visualizer().get_entity(doc[2], "_.depp")
+
+
+def test_visualization_minimal_render_table_one_sentence(
+    fully_featured_doc_one_sentence,
+):
+    formats = [
+        AttributeFormat("tree_left"),
+        AttributeFormat("dep_"),
+        AttributeFormat("text"),
+        AttributeFormat("lemma_"),
+        AttributeFormat("pos_"),
+        AttributeFormat("tag_"),
+        AttributeFormat("morph"),
+        AttributeFormat("ent_type_"),
+    ]
+    assert (
+        Visualizer().render_table(fully_featured_doc_one_sentence, formats).strip()
+        == """
+ ╔>╔═ poss Sarah sarah PROPN NNP NounType=prop|Number=sing PERSON
+ ║ ╚> case 's 's PART POS Poss=yes
+╔>╚═══ nsubj sister sister NOUN NN Number=sing
+╠═════ ROOT flew fly VERB VBD Tense=past|VerbForm=fin
+╠>╔═══ prep to to ADP IN
+║ ║ ╔> compound Silicon silicon PROPN NNP NounType=prop|Number=sing GPE
+║ ╚>╚═ pobj Valley valley PROPN NNP NounType=prop|Number=sing GPE
+╠══>╔═ prep via via ADP IN
+║ ╚> pobj London london PROPN NNP NounType=prop|Number=sing GPE
+╚════> punct . . PUNCT . PunctType=peri
+""".strip()
+    )
+
+
+def test_visualization_minimal_render_table_two_sentences(
+    fully_featured_doc_two_sentences,
+):
+    formats = [
+        AttributeFormat("tree_left"),
+        AttributeFormat("dep_"),
+        AttributeFormat("text"),
+        AttributeFormat("lemma_"),
+        AttributeFormat("pos_"),
+        AttributeFormat("tag_"),
+        AttributeFormat("morph"),
+        AttributeFormat("ent_type_"),
+    ]
+
+    assert (
+        Visualizer().render_table(fully_featured_doc_two_sentences, formats).strip()
+        == """
+ ╔>╔═ poss Sarah sarah PROPN NNP NounType=prop|Number=sing PERSON
+ ║ ╚> case 's 's PART POS Poss=yes
+╔>╚═══ nsubj sister sister NOUN NN Number=sing
+╠═════ ROOT flew fly VERB VBD Tense=past|VerbForm=fin
+╠>╔═══ prep to to ADP IN
+║ ║ ╔> compound Silicon silicon PROPN NNP NounType=prop|Number=sing GPE
+║ ╚>╚═ pobj Valley valley PROPN NNP NounType=prop|Number=sing GPE
+╠══>╔═ prep via via ADP IN
+║ ╚> pobj London london PROPN NNP NounType=prop|Number=sing GPE
+╚════> punct . . PUNCT . PunctType=peri
+
+
+╔> nsubj She she PRON PRP Case=Nom|Gender=Fem|Number=Sing|Person=3|PronType=Prs
+╠═ ROOT loved love VERB VBD Tense=Past|VerbForm=Fin
+╠> dobj it it PRON PRP Case=Acc|Gender=Neut|Number=Sing|Person=3|PronType=Prs
+╚> punct . . PUNCT . PunctType=peri
+""".strip()
+    )
+
+
+def test_visualization_rich_render_table_one_sentence(
+    fully_featured_doc_one_sentence,
+):
+    formats = [
+        AttributeFormat("tree_left", name="tree", aligns="r", fg_color=2),
+        AttributeFormat("dep_", name="dep", fg_color=2),
+        AttributeFormat("i", name="index", aligns="r"),
+        AttributeFormat("text", name="text"),
+        AttributeFormat("lemma_", name="lemma"),
+        AttributeFormat("pos_", name="pos", fg_color=100),
+        AttributeFormat("tag_", name="tag", fg_color=100),
+        AttributeFormat("morph", name="morph", fg_color=100, max_width=15),
+        AttributeFormat(
+            "ent_type_",
+            name="ent",
+            fg_color=196,
+            value_dependent_fg_colors={"PERSON": 50},
+            value_dependent_bg_colors={"PERSON": 12},
+        ),
+    ]
+    assert (
+        Visualizer().render_table(fully_featured_doc_one_sentence, formats)
+        == ("\n\x1b[38;5;2m tree\x1b[0m \x1b[38;5;2mdep \x1b[0m index text lemma \x1b[38;5;100mpos \x1b[0m \x1b[38;5;100mtag\x1b[0m \x1b[38;5;100mmorph \x1b[0m \x1b[38;5;196ment \x1b[0m\n\x1b[38;5;2m------\x1b[0m \x1b[38;5;2m--------\x1b[0m ----- ------- ------- \x1b[38;5;100m-----\x1b[0m \x1b[38;5;100m---\x1b[0m \x1b[38;5;100m---------------\x1b[0m \x1b[38;5;196m------\x1b[0m\n\x1b[38;5;2m ╔>╔═\x1b[0m \x1b[38;5;2mposs \x1b[0m 0 Sarah sarah \x1b[38;5;100mPROPN\x1b[0m \x1b[38;5;100mNNP\x1b[0m \x1b[38;5;100mNounType=prop|N\x1b[0m \x1b[38;5;196m\x1b[38;5;50;48;5;12mPERSON\x1b[0m\x1b[0m\n\x1b[38;5;2m ║ ╚>\x1b[0m \x1b[38;5;2mcase \x1b[0m 1 's 's \x1b[38;5;100mPART \x1b[0m \x1b[38;5;100mPOS\x1b[0m \x1b[38;5;100mPoss=yes \x1b[0m \x1b[38;5;196m \x1b[0m\n\x1b[38;5;2m╔>╚═══\x1b[0m \x1b[38;5;2mnsubj \x1b[0m 2 sister sister \x1b[38;5;100mNOUN \x1b[0m \x1b[38;5;100mNN \x1b[0m \x1b[38;5;100mNumber=sing \x1b[0m \x1b[38;5;196m \x1b[0m\n\x1b[38;5;2m╠═════\x1b[0m \x1b[38;5;2mROOT \x1b[0m 3 flew fly \x1b[38;5;100mVERB \x1b[0m \x1b[38;5;100mVBD\x1b[0m \x1b[38;5;100mTense=past|Verb\x1b[0m \x1b[38;5;196m \x1b[0m\n\x1b[38;5;2m╠>╔═══\x1b[0m \x1b[38;5;2mprep \x1b[0m 4 to to \x1b[38;5;100mADP \x1b[0m \x1b[38;5;100mIN \x1b[0m \x1b[38;5;100m \x1b[0m \x1b[38;5;196m \x1b[0m\n\x1b[38;5;2m║ ║ ╔>\x1b[0m \x1b[38;5;2mcompound\x1b[0m 5 Silicon silicon \x1b[38;5;100mPROPN\x1b[0m \x1b[38;5;100mNNP\x1b[0m \x1b[38;5;100mNounType=prop|N\x1b[0m \x1b[38;5;196mGPE \x1b[0m\n\x1b[38;5;2m║ ╚>╚═\x1b[0m \x1b[38;5;2mpobj \x1b[0m 6 Valley valley \x1b[38;5;100mPROPN\x1b[0m \x1b[38;5;100mNNP\x1b[0m \x1b[38;5;100mNounType=prop|N\x1b[0m \x1b[38;5;196mGPE \x1b[0m\n\x1b[38;5;2m╠══>╔═\x1b[0m \x1b[38;5;2mprep \x1b[0m 7 via via \x1b[38;5;100mADP \x1b[0m \x1b[38;5;100mIN \x1b[0m \x1b[38;5;100m \x1b[0m \x1b[38;5;196m \x1b[0m\n\x1b[38;5;2m║ ╚>\x1b[0m \x1b[38;5;2mpobj \x1b[0m 8 London london \x1b[38;5;100mPROPN\x1b[0m \x1b[38;5;100mNNP\x1b[0m \x1b[38;5;100mNounType=prop|N\x1b[0m \x1b[38;5;196mGPE \x1b[0m\n\x1b[38;5;2m╚════>\x1b[0m \x1b[38;5;2mpunct \x1b[0m 9 . . \x1b[38;5;100mPUNCT\x1b[0m \x1b[38;5;100m. \x1b[0m \x1b[38;5;100mPunctType=peri \x1b[0m \x1b[38;5;196m \x1b[0m\n\n"
+            if SUPPORTS_ANSI
+            else "\n tree dep index text lemma pos tag morph ent \n------ -------- ----- ------- ------- ----- --- ------------------------- ------\n ╔>╔═ poss 0 Sarah sarah PROPN NNP NounType=prop|Number=sing PERSON\n ║ ╚> case 1 's 's PART POS Poss=yes \n╔>╚═══ nsubj 2 sister sister NOUN NN Number=sing \n╠═════ ROOT 3 flew fly VERB VBD Tense=past|VerbForm=fin \n╠>╔═══ prep 4 to to ADP IN \n║ ║ ╔> compound 5 Silicon silicon PROPN NNP NounType=prop|Number=sing GPE \n║ ╚>╚═ pobj 6 Valley valley PROPN NNP NounType=prop|Number=sing GPE \n╠══>╔═ prep 7 via via ADP IN \n║ ╚> pobj 8 London london PROPN NNP NounType=prop|Number=sing GPE \n╚════> punct 9 . . PUNCT . PunctType=peri \n\n")
+    )
+
+
+def test_visualization_rich_render_table_two_sentences(
+    fully_featured_doc_two_sentences,
+):
+    formats = [
+        AttributeFormat("tree_left", name="tree", aligns="r", fg_color=2),
+        AttributeFormat("dep_", name="dep", fg_color=2),
+        AttributeFormat("i", name="index", aligns="r"),
+        AttributeFormat("text", name="text"),
+        AttributeFormat("lemma_", name="lemma"),
+        AttributeFormat("pos_", name="pos", fg_color=100),
+        AttributeFormat("tag_", name="tag", fg_color=100),
+        AttributeFormat("morph", name="morph", fg_color=100, max_width=15),
+        AttributeFormat(
+            "ent_type_",
+            name="ent",
+            fg_color=196,
+            value_dependent_fg_colors={"PERSON": 50},
+            value_dependent_bg_colors={"PERSON": 12},
+        ),
+    ]
+    assert (
+        Visualizer().render_table(fully_featured_doc_two_sentences, formats)
+        == ("\n\x1b[38;5;2m tree\x1b[0m \x1b[38;5;2mdep \x1b[0m index text lemma \x1b[38;5;100mpos \x1b[0m \x1b[38;5;100mtag\x1b[0m \x1b[38;5;100mmorph \x1b[0m \x1b[38;5;196ment \x1b[0m\n\x1b[38;5;2m------\x1b[0m \x1b[38;5;2m--------\x1b[0m ----- ------- ------- \x1b[38;5;100m-----\x1b[0m \x1b[38;5;100m---\x1b[0m \x1b[38;5;100m---------------\x1b[0m \x1b[38;5;196m------\x1b[0m\n\x1b[38;5;2m ╔>╔═\x1b[0m \x1b[38;5;2mposs \x1b[0m 0 Sarah sarah \x1b[38;5;100mPROPN\x1b[0m \x1b[38;5;100mNNP\x1b[0m \x1b[38;5;100mNounType=prop|N\x1b[0m \x1b[38;5;196m\x1b[38;5;50;48;5;12mPERSON\x1b[0m\x1b[0m\n\x1b[38;5;2m ║ ╚>\x1b[0m \x1b[38;5;2mcase \x1b[0m 1 's 's \x1b[38;5;100mPART \x1b[0m \x1b[38;5;100mPOS\x1b[0m \x1b[38;5;100mPoss=yes \x1b[0m \x1b[38;5;196m \x1b[0m\n\x1b[38;5;2m╔>╚═══\x1b[0m \x1b[38;5;2mnsubj \x1b[0m 2 sister sister \x1b[38;5;100mNOUN \x1b[0m \x1b[38;5;100mNN \x1b[0m \x1b[38;5;100mNumber=sing \x1b[0m \x1b[38;5;196m \x1b[0m\n\x1b[38;5;2m╠═════\x1b[0m \x1b[38;5;2mROOT \x1b[0m 3 flew fly \x1b[38;5;100mVERB \x1b[0m \x1b[38;5;100mVBD\x1b[0m \x1b[38;5;100mTense=past|Verb\x1b[0m \x1b[38;5;196m \x1b[0m\n\x1b[38;5;2m╠>╔═══\x1b[0m \x1b[38;5;2mprep \x1b[0m 4 to to \x1b[38;5;100mADP \x1b[0m \x1b[38;5;100mIN \x1b[0m \x1b[38;5;100m \x1b[0m \x1b[38;5;196m \x1b[0m\n\x1b[38;5;2m║ ║ ╔>\x1b[0m \x1b[38;5;2mcompound\x1b[0m 5 Silicon silicon \x1b[38;5;100mPROPN\x1b[0m \x1b[38;5;100mNNP\x1b[0m \x1b[38;5;100mNounType=prop|N\x1b[0m \x1b[38;5;196mGPE \x1b[0m\n\x1b[38;5;2m║ ╚>╚═\x1b[0m \x1b[38;5;2mpobj \x1b[0m 6 Valley valley \x1b[38;5;100mPROPN\x1b[0m \x1b[38;5;100mNNP\x1b[0m \x1b[38;5;100mNounType=prop|N\x1b[0m \x1b[38;5;196mGPE \x1b[0m\n\x1b[38;5;2m╠══>╔═\x1b[0m \x1b[38;5;2mprep \x1b[0m 7 via via \x1b[38;5;100mADP \x1b[0m \x1b[38;5;100mIN \x1b[0m \x1b[38;5;100m \x1b[0m \x1b[38;5;196m \x1b[0m\n\x1b[38;5;2m║ ╚>\x1b[0m \x1b[38;5;2mpobj \x1b[0m 8 London london \x1b[38;5;100mPROPN\x1b[0m \x1b[38;5;100mNNP\x1b[0m \x1b[38;5;100mNounType=prop|N\x1b[0m \x1b[38;5;196mGPE \x1b[0m\n\x1b[38;5;2m╚════>\x1b[0m \x1b[38;5;2mpunct \x1b[0m 9 . . \x1b[38;5;100mPUNCT\x1b[0m \x1b[38;5;100m. \x1b[0m \x1b[38;5;100mPunctType=peri \x1b[0m \x1b[38;5;196m \x1b[0m\n\n\n\x1b[38;5;2mtree\x1b[0m \x1b[38;5;2mdep \x1b[0m index text lemma \x1b[38;5;100mpos \x1b[0m \x1b[38;5;100mtag\x1b[0m \x1b[38;5;100mmorph \x1b[0m \x1b[38;5;196ment\x1b[0m\n\x1b[38;5;2m----\x1b[0m \x1b[38;5;2m-----\x1b[0m ----- ----- ----- \x1b[38;5;100m-----\x1b[0m \x1b[38;5;100m---\x1b[0m \x1b[38;5;100m---------------\x1b[0m \x1b[38;5;196m---\x1b[0m\n\x1b[38;5;2m ╔>\x1b[0m \x1b[38;5;2mnsubj\x1b[0m 10 She she \x1b[38;5;100mPRON \x1b[0m \x1b[38;5;100mPRP\x1b[0m \x1b[38;5;100mCase=Nom|Gender\x1b[0m \x1b[38;5;196m \x1b[0m\n\x1b[38;5;2m ╠═\x1b[0m \x1b[38;5;2mROOT \x1b[0m 11 loved love \x1b[38;5;100mVERB \x1b[0m \x1b[38;5;100mVBD\x1b[0m \x1b[38;5;100mTense=Past|Verb\x1b[0m \x1b[38;5;196m \x1b[0m\n\x1b[38;5;2m ╠>\x1b[0m \x1b[38;5;2mdobj \x1b[0m 12 it it \x1b[38;5;100mPRON \x1b[0m \x1b[38;5;100mPRP\x1b[0m \x1b[38;5;100mCase=Acc|Gender\x1b[0m \x1b[38;5;196m \x1b[0m\n\x1b[38;5;2m ╚>\x1b[0m \x1b[38;5;2mpunct\x1b[0m 13 . . \x1b[38;5;100mPUNCT\x1b[0m \x1b[38;5;100m. \x1b[0m \x1b[38;5;100mPunctType=peri \x1b[0m \x1b[38;5;196m \x1b[0m\n\n"
+            if SUPPORTS_ANSI
+            else "\n tree dep index text lemma pos tag morph ent \n------ -------- ----- ------- ------- ----- --- ------------------------- ------\n ╔>╔═ poss 0 Sarah sarah PROPN NNP NounType=prop|Number=sing PERSON\n ║ ╚> case 1 's 's PART POS Poss=yes \n╔>╚═══ nsubj 2 sister sister NOUN NN Number=sing \n╠═════ ROOT 3 flew fly VERB VBD Tense=past|VerbForm=fin \n╠>╔═══ prep 4 to to ADP IN \n║ ║ ╔> compound 5 Silicon silicon PROPN NNP NounType=prop|Number=sing GPE \n║ ╚>╚═ pobj 6 Valley valley PROPN NNP NounType=prop|Number=sing GPE \n╠══>╔═ prep 7 via via ADP IN \n║ ╚> pobj 8 London london PROPN NNP NounType=prop|Number=sing GPE \n╚════> punct 9 . . PUNCT . PunctType=peri \n\n\ntree dep index text lemma pos tag morph ent\n---- ----- ----- ----- ----- ----- --- ------------------------------------------------------ ---\n ╔> nsubj 10 She she PRON PRP Case=Nom|Gender=Fem|Number=Sing|Person=3|PronType=Prs \n ╠═ ROOT 11 loved love VERB VBD Tense=Past|VerbForm=Fin \n ╠> dobj 12 it it PRON PRP Case=Acc|Gender=Neut|Number=Sing|Person=3|PronType=Prs \n ╚> punct 13 . . PUNCT . PunctType=peri \n\n")
+    )
diff --git a/spacy/visualization.py b/spacy/visualization.py
index 19f06ac4f..e03e77bbd 100644
--- a/spacy/visualization.py
+++ b/spacy/visualization.py
@@ -1,7 +1,34 @@
+from os import linesep
+from typing import Optional, Union
+import wasabi
 from spacy.tests.lang.ko.test_tokenizer import FULL_TAG_TESTS
-from spacy.tokens import Span
+from spacy.tokens import Span, Token, Doc
 from spacy.util import working_dir
+
+
+class AttributeFormat:
+    def __init__(
+        self,
+        attribute: str,
+        *,
+        name: str = "",
+        aligns: str = "l",
+        max_width: Optional[int] = None,
+        fg_color: Optional[Union[str, int]] = None,
+        bg_color: Optional[Union[str, int]] = None,
+        value_dependent_fg_colors: Optional[dict[str, Union[str, int]]] = None,
+        value_dependent_bg_colors: Optional[dict[str, Union[str, int]]] = None,
+    ):
+        self.attribute = attribute
+        self.name = name
+        self.aligns = aligns
+        self.max_width = max_width
+        self.fg_color = fg_color
+        self.bg_color = bg_color
+        self.value_dependent_fg_colors = value_dependent_fg_colors
+        self.value_dependent_bg_colors = value_dependent_bg_colors
+
+
 SPACE = 0
 HALF_HORIZONTAL_LINE = 1  # the half is the half further away from the root
 FULL_HORIZONTAL_LINE = 3
@@ -37,12 +64,11 @@ ROOT_LEFT_CHARS = {
 }
 
 
-class TableColumn:
-    def __init__(self, entity: str, width: int, overflow_strategy: str = "truncate"):
-        pass
-
-
 class Visualizer:
+
+    def __init__(self):
+        self.printer = wasabi.Printer(no_print=True)
+
     @staticmethod
     def render_dependency_tree(sent: Span, root_right: bool) -> list[str]:
         """
@@ -65,6 +91,17 @@
             else token.head.i - sent.start
             for token in sent
         ]
+        # Check there are no head references outside the sentence
+        assert (
+            len(
+                [
+                    head
+                    for head in heads
+                    if head is not None and (head < 0 or head >= sent.end - sent.start)
+                ]
+            )
+            == 0
+        )
         children_lists = [[] for _ in range(sent.end - sent.start)]
         for child, head in enumerate(heads):
             if head is not None:
@@ -257,3 +294,85 @@
             )[::-1]
             for vertical_position in range(sent.end - sent.start)
         ]
+
+    def get_entity(
+        self,
+        token: Token,
+        entity_name: str,
+        *,
+        value_dependent_fg_colors: Optional[dict[str, Union[str, int]]] = None,
+        value_dependent_bg_colors: Optional[dict[str, Union[str, int]]] = None,
+        truncate_at_width: Optional[int] = None,
+    ) -> str:
+        obj = token
+        parts = entity_name.split(".")
+        for part in parts[:-1]:
+            obj = getattr(obj, part)
+        value = str(getattr(obj, parts[-1]))
+        if truncate_at_width is not None:
+            value = value[:truncate_at_width]
+        fg_color = (value_dependent_fg_colors or {}).get(value)
+        bg_color = (value_dependent_bg_colors or {}).get(value)
+        if fg_color is not None or bg_color is not None:
+            value = self.printer.text(value, color=fg_color, bg_color=bg_color)
+        return value
+
+    def render_table(
+        self, doc: Doc, columns: list[AttributeFormat], spacing: int = 3
+    ) -> str:
+        return_string = ""
+        for sent in doc.sents:
+            if "tree_right" in (c.attribute for c in columns):
+                tree_right = self.render_dependency_tree(sent, True)
+            if "tree_left" in (c.attribute for c in columns):
+                tree_left = self.render_dependency_tree(sent, False)
+            widths = []
+            for column in columns:
+                # work out the column width; the values contain no color codes yet
+                if column.attribute == "tree_left":
+                    width = len(tree_left[0])
+                elif column.attribute == "tree_right":
+                    width = len(tree_right[0])
+                else:
+                    width = max(len(self.get_entity(token, column.attribute)) for token in sent)
+                if column.max_width is not None:
+                    width = min(width, column.max_width)
+                width = max(width, len(column.name))
+                widths.append(width)
+            data = [
+                [
+                    tree_right[token_index]
+                    if column.attribute == "tree_right"
+                    else tree_left[token_index]
+                    if column.attribute == "tree_left"
+                    else self.get_entity(
+                        token,
+                        column.attribute,
+                        value_dependent_fg_colors=column.value_dependent_fg_colors,
+                        value_dependent_bg_colors=column.value_dependent_bg_colors,
+                        truncate_at_width=widths[column_index],
+                    )
+                    for column_index, column in enumerate(columns)
+                ]
+                for token_index, token in enumerate(sent)
+            ]
+            if any(len(c.name) > 0 for c in columns):
+                header = [c.name for c in columns]
+            else:
+                header = None
+            aligns = [c.aligns for c in columns]
+            fg_colors = [c.fg_color for c in columns]
+            bg_colors = [c.bg_color for c in columns]
+            return_string += (
+                wasabi.table(
+                    data,
+                    header=header,
+                    divider=True,
+                    aligns=aligns,
+                    widths=widths,
+                    fg_colors=fg_colors,
+                    bg_colors=bg_colors,
+                )
+                + linesep
+            )
+        return return_string
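
Reviewer note: a minimal usage sketch of the API this patch introduces (`AttributeFormat` plus `Visualizer.render_table`), assuming this branch is installed. The `Doc` is built by hand with `heads`/`deps`, mirroring the test fixtures above; the sentence content and dependency labels are illustrative only.

```python
import spacy
from spacy.tokens import Doc
from spacy.visualization import AttributeFormat, Visualizer

nlp = spacy.blank("en")
# A tiny hand-parsed Doc; None marks the root, as in the tests above.
doc = Doc(
    nlp.vocab,
    words=["I", "saw", "a", "horse", "."],
    heads=[1, None, 3, 1, 1],
    deps=["nsubj", "ROOT", "det", "dobj", "punct"],
)
columns = [
    AttributeFormat("tree_left", name="tree", aligns="r"),
    AttributeFormat("dep_", name="dep"),
    AttributeFormat("text", name="text"),
]
# render_table() returns one aligned table per sentence as a single string.
print(Visualizer().render_table(doc, columns))
```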