Add render_instances function

2025-08-04 12:20:20 +03:00 · 2021-12-08 19:24:32 +01:00 · 2021-12-08 19:24:32 +01:00 · ed788c5def
commit ed788c5def
parent bd00611259
2 changed files with 250 additions and 15 deletions
--- a/spacy/tests/test_visualization.py
+++ b/spacy/tests/test_visualization.py
@ -486,6 +486,38 @@ def test_visualization_minimal_render_table_one_sentence(
    )
 def test_visualization_minimal_render_table_empty_text_no_headers(
    en_vocab,
 ):
    """Rendering an empty Doc with columns that have no explicit name gives only whitespace."""
    attribute_names = (
        "tree_left",
        "dep_",
        "text",
        "lemma_",
        "pos_",
        "tag_",
        "morph",
        "ent_type_",
    )
    columns = [AttributeFormat(attribute_name) for attribute_name in attribute_names]
    rendered = Visualizer().render_table(Doc(en_vocab), columns)
    assert rendered.strip() == ""
 def test_visualization_minimal_render_table_empty_text_headers(
    en_vocab,
 ):
    """Rendering an empty Doc gives only whitespace even when some columns are named."""
    # The first and last columns carry an explicit header name; the rest do not.
    columns = [AttributeFormat("tree_left", name="tree")]
    columns.extend(
        AttributeFormat(attribute_name)
        for attribute_name in ("dep_", "text", "lemma_", "pos_", "tag_", "morph")
    )
    columns.append(AttributeFormat("ent_type_", name="ent"))
    rendered = Visualizer().render_table(Doc(en_vocab), columns)
    assert rendered.strip() == ""
 def test_visualization_minimal_render_table_permitted_values(
    fully_featured_doc_one_sentence,
 ):
@ -640,7 +672,8 @@ def test_visualization_rich_render_table_two_sentences(
        else "\n  tree   dep        index   text      lemma     pos     tag   morph                       ent   \n------   --------   -----   -------   -------   -----   ---   -------------------------   ------\n  ╔>╔═   poss           0   Sarah     sarah     PROPN   NNP   NounType=prop|Number=sing   PERSON\n  ║ ╚>   case           1   's        's        PART    POS   Poss=yes                          \n╔>╚═══   nsubj          2   sister    sister    NOUN    NN    Number=sing                       \n╠═════   ROOT           3   flew      fly       VERB    VBD   Tense=past|VerbForm=fin           \n╠>╔═══   prep           4   to        to        ADP     IN                                      \n║ ║ ╔>   compound       5   Silicon   silicon   PROPN   NNP   NounType=prop|Number=sing   GPE   \n║ ╚>╚═   pobj           6   Valley    valley    PROPN   NNP   NounType=prop|Number=sing   GPE   \n╠══>╔═   prep           7   via       via       ADP     IN                                      \n║   ╚>   pobj           8   London    london    PROPN   NNP   NounType=prop|Number=sing   GPE   \n╚════>   punct          9   .         .         PUNCT   .     PunctType=peri                    \n\n\ntree   dep     index   text    lemma   pos     tag   morph                                                    ent\n----   -----   -----   -----   -----   -----   ---   ------------------------------------------------------   ---\n  ╔>   nsubj      10   She     she     PRON    PRP   Case=Nom|Gender=Fem|Number=Sing|Person=3|PronType=Prs       \n  ╠═   ROOT       11   loved   love    VERB    VBD   Tense=Past|VerbForm=Fin                                     \n  ╠>   dobj       12   it      it      PRON    PRP   Case=Acc|Gender=Neut|Number=Sing|Person=3|PronType=Prs      \n  ╚>   punct      13   .       .       PUNCT   .     PunctType=peri                                              \n\n"
    )
-def test_render_text_with_text_format(
+
 def test_visualization_text_with_text_format(
    fully_featured_doc_two_sentences,
 ):
    formats = [
@ -658,15 +691,18 @@ def test_render_text_with_text_format(
            value_dependent_bg_colors={"PERSON": 12},
        ),
        AttributeFormat(
-            "lemma_",
+            "lemma_", fg_color=50, bg_color=53, permitted_values=("fly", "valley")
            fg_color=50,
            bg_color=53,
            permitted_values=("fly", "valley")
        ),
    ]
-    assert Visualizer().render_text(fully_featured_doc_two_sentences, formats) == "\x1b[38;5;50;48;5;53mSarah\x1b[0m \x1b[38;5;50;48;5;12mPERSON\x1b[0m's sister \x1b[38;5;50;48;5;53mflew\x1b[0m \x1b[38;5;50;48;5;53mfly\x1b[0m to \x1b[38;5;50;48;5;53mSilicon\x1b[0m \x1b[38;5;50mGPE\x1b[0m \x1b[38;5;50;48;5;53mValley\x1b[0m \x1b[38;5;50mGPE\x1b[0m \x1b[38;5;50;48;5;53mvalley\x1b[0m via \x1b[38;5;50;48;5;53mLondon\x1b[0m \x1b[38;5;50mGPE\x1b[0m. She loved it." if supports_ansi else "Sarah PERSON's sister flew fly to Silicon GPE Valley GPE valley via London GPE. She loved it."
+    assert (
        Visualizer().render_text(fully_featured_doc_two_sentences, formats)
        == "\x1b[38;5;50;48;5;53mSarah\x1b[0m \x1b[38;5;50;48;5;12mPERSON\x1b[0m's sister \x1b[38;5;50;48;5;53mflew\x1b[0m \x1b[38;5;50;48;5;53mfly\x1b[0m to \x1b[38;5;50;48;5;53mSilicon\x1b[0m \x1b[38;5;50mGPE\x1b[0m \x1b[38;5;50;48;5;53mValley\x1b[0m \x1b[38;5;50mGPE\x1b[0m \x1b[38;5;50;48;5;53mvalley\x1b[0m via \x1b[38;5;50;48;5;53mLondon\x1b[0m \x1b[38;5;50mGPE\x1b[0m. She loved it."
        if supports_ansi
        else "Sarah PERSON's sister flew fly to Silicon GPE Valley GPE valley via London GPE. She loved it."
    )
-def test_render_text_without_text_format(
+
 def test_visualization_render_text_without_text_format(
    fully_featured_doc_two_sentences,
 ):
    formats = [
@ -675,9 +711,143 @@ def test_render_text_without_text_format(
            value_dependent_fg_colors={"PERSON": 50},
            value_dependent_bg_colors={"PERSON": 12},
        ),
-        AttributeFormat(
+        AttributeFormat("lemma_", permitted_values=("fly", "valley")),
            "lemma_",
            permitted_values=("fly", "valley")
        ),
    ]
-    assert Visualizer().render_text(fully_featured_doc_two_sentences, formats) == "Sarah \x1b[38;5;50;48;5;12mPERSON\x1b[0m's sister flew fly to Silicon GPE Valley GPE valley via London GPE. She loved it." if supports_ansi else "Sarah PERSON's sister flew fly to Silicon GPE Valley GPE valley via London GPE. She loved it."
+    assert (
        Visualizer().render_text(fully_featured_doc_two_sentences, formats)
        == "Sarah \x1b[38;5;50;48;5;12mPERSON\x1b[0m's sister flew fly to Silicon GPE Valley GPE valley via London GPE. She loved it."
        if supports_ansi
        else "Sarah PERSON's sister flew fly to Silicon GPE Valley GPE valley via London GPE. She loved it."
    )
 def test_visualization_minimal_render_instances_two_sentences_type_non_grouping(
    fully_featured_doc_two_sentences,
 ):
    """Search on ent_type_ without grouping and compare the rendered rows."""
    column_attributes = ("dep_", "text", "lemma_", "pos_", "tag_", "morph", "ent_type_")
    columns = [AttributeFormat(attribute_name) for attribute_name in column_attributes]
    searched = [AttributeFormat("ent_type_")]
    rendered = Visualizer().render_instances(
        fully_featured_doc_two_sentences,
        search_attributes=searched,
        display_columns=columns,
        group=False,
    )
    # One literal per table row; the pieces concatenate to the full expected output.
    expected = (
        "\n"
        "poss       Sarah     sarah     PROPN   NNP   NounType=prop|Number=sing   PERSON\n"
        "compound   Silicon   silicon   PROPN   NNP   NounType=prop|Number=sing   GPE   \n"
        "pobj       Valley    valley    PROPN   NNP   NounType=prop|Number=sing   GPE   \n"
        "pobj       London    london    PROPN   NNP   NounType=prop|Number=sing   GPE   \n"
    )
    assert rendered == expected
 def test_visualization_minimal_render_instances_two_sentences_value_non_grouping(
    fully_featured_doc_two_sentences,
 ):
    """Search on ent_type_ restricted to the value PERSON, without grouping."""
    column_attributes = ("dep_", "text", "lemma_", "pos_", "tag_", "morph", "ent_type_")
    columns = [AttributeFormat(attribute_name) for attribute_name in column_attributes]
    searched = [AttributeFormat("ent_type_", permitted_values=["PERSON"])]
    rendered = Visualizer().render_instances(
        fully_featured_doc_two_sentences,
        search_attributes=searched,
        display_columns=columns,
        group=False,
    )
    expected = (
        "\n"
        "poss   Sarah   sarah   PROPN   NNP   NounType=prop|Number=sing   PERSON\n"
    )
    assert rendered == expected
 def test_visualization_minimal_render_instances_two_sentences_missing_value_non_grouping(
    fully_featured_doc_two_sentences,
 ):
    """Search for the permitted value "PERSONN"; only the header and divider are expected."""
    # Only the first two columns are given an explicit header name.
    columns = [
        AttributeFormat("dep_", name="dep"),
        AttributeFormat("text", name="text"),
    ]
    columns.extend(
        AttributeFormat(attribute_name)
        for attribute_name in ("lemma_", "pos_", "tag_", "morph", "ent_type_")
    )
    searched = [AttributeFormat("ent_type_", permitted_values=["PERSONN"])]
    rendered = Visualizer().render_instances(
        fully_featured_doc_two_sentences,
        search_attributes=searched,
        display_columns=columns,
        group=False,
    )
    expected = "\ndep   text               \n---   ----               \n"
    assert rendered == expected
 def test_visualization_minimal_render_instances_two_sentences_type_grouping(
    fully_featured_doc_two_sentences,
 ):
    """Search on ent_type_ and lemma_ with grouping enabled and compare the row order."""
    column_attributes = ("dep_", "text", "lemma_", "pos_", "tag_", "morph", "ent_type_")
    columns = [AttributeFormat(attribute_name) for attribute_name in column_attributes]
    searched = [
        AttributeFormat(attribute_name) for attribute_name in ("ent_type_", "lemma_")
    ]
    rendered = Visualizer().render_instances(
        fully_featured_doc_two_sentences,
        search_attributes=searched,
        display_columns=columns,
        group=True,
    )
    # One literal per table row; the pieces concatenate to the full expected output.
    expected = (
        "\n"
        "pobj       London    london    PROPN   NNP   NounType=prop|Number=sing   GPE   \n"
        "compound   Silicon   silicon   PROPN   NNP   NounType=prop|Number=sing   GPE   \n"
        "pobj       Valley    valley    PROPN   NNP   NounType=prop|Number=sing   GPE   \n"
        "poss       Sarah     sarah     PROPN   NNP   NounType=prop|Number=sing   PERSON\n"
    )
    assert rendered == expected
 def test_visualization_minimal_render_instances_two_sentences_type_grouping_colors(
    fully_featured_doc_two_sentences,
 ):
    """Grouped instance rendering with a foreground colour on dep_ and a background colour on text.

    The expected output depends on `supports_ansi`: with ANSI support the two
    coloured columns are wrapped in escape sequences, otherwise the table is
    plain text.
    """
    display_columns = [
        AttributeFormat("dep_", fg_color=20),
        AttributeFormat("text", bg_color=30),
        AttributeFormat("lemma_"),
        AttributeFormat("pos_"),
        AttributeFormat("tag_"),
        AttributeFormat("morph"),
        AttributeFormat("ent_type_"),
    ]
    search_attributes = [AttributeFormat("ent_type_"), AttributeFormat("lemma_")]
    # Choose the expected string first. Writing `assert a == b if cond else c`
    # parses as `(a == b) if cond else c`, which would assert a non-empty
    # string (always true) whenever ANSI is unsupported.
    expected = (
        "\n\x1b[38;5;20mpobj    \x1b[0m   \x1b[48;5;30mLondon \x1b[0m   london    PROPN   NNP   NounType=prop|Number=sing   GPE   \n\x1b[38;5;20mcompound\x1b[0m   \x1b[48;5;30mSilicon\x1b[0m   silicon   PROPN   NNP   NounType=prop|Number=sing   GPE   \n\x1b[38;5;20mpobj    \x1b[0m   \x1b[48;5;30mValley \x1b[0m   valley    PROPN   NNP   NounType=prop|Number=sing   GPE   \n\x1b[38;5;20mposs    \x1b[0m   \x1b[48;5;30mSarah  \x1b[0m   sarah     PROPN   NNP   NounType=prop|Number=sing   PERSON\n"
        if supports_ansi
        else "\npobj       London    london    PROPN   NNP   NounType=prop|Number=sing   GPE   \ncompound   Silicon   silicon   PROPN   NNP   NounType=prop|Number=sing   GPE   \npobj       Valley    valley    PROPN   NNP   NounType=prop|Number=sing   GPE   \nposs       Sarah     sarah     PROPN   NNP   NounType=prop|Number=sing   PERSON\n"
    )
    assert (
        Visualizer().render_instances(
            fully_featured_doc_two_sentences,
            search_attributes=search_attributes,
            display_columns=display_columns,
            group=True,
        )
        == expected
    )
--- a/spacy/visualization.py
+++ b/spacy/visualization.py
@ -344,9 +344,12 @@ class Visualizer:
                elif column.attribute == "tree_right":
                    width = len(tree_right[0])
                else:
-                    width = max(
+                    if len(sent) > 0:
-                        len(column.render(token, ignore_colors=True)) for token in sent
+                        width = max(
-                    )
+                            len(column.render(token, ignore_colors=True)) for token in sent
                        )
                    else:
                        width = 0
                    if column.max_width is not None:
                        width = min(width, column.max_width)
                width = max(width, len(column.name))
@ -406,3 +409,65 @@ class Visualizer:
            this_token_strings.append(token.whitespace_)
            return_string += "".join(this_token_strings)
        return return_string
    def render_instances(
        self,
        doc: Doc,
        *,
        search_attributes: list[AttributeFormat],
        display_columns: list[AttributeFormat],
        group: bool,
        spacing: int = 3,
    ) -> str:
        """Render a table with one row for each token in *doc* that matches.

        A token matches when every entry of *search_attributes* renders to a
        non-empty string for it (rendered with colors ignored).

        doc: the document whose tokens are searched.
        search_attributes: attribute formats that must all render non-empty
            for a token to be included; also used as the sort key when
            *group* is true.
        display_columns: attribute formats that define the table columns.
        group: if true, matching tokens are sorted by the list of their
            rendered search-attribute values; otherwise document order is kept.
        spacing: passed through to `wasabi.table` as `spacing`.
        RETURNS: the string returned by `wasabi.table`.
        """

        def search_values(token: Token) -> list:
            # Rendered (uncolored) value of each search attribute for a token.
            return [
                attribute.render(token, ignore_colors=True)
                for attribute in search_attributes
            ]

        tokens = [
            token
            for token in doc
            if all(len(value) > 0 for value in search_values(token))
        ]
        if group:
            tokens.sort(key=search_values)

        widths = []
        for column in display_columns:
            # Widest uncolored cell in this column; 0 when nothing matched.
            width = max(
                (len(column.render(token, ignore_colors=True)) for token in tokens),
                default=0,
            )
            if column.max_width is not None:
                width = min(width, column.max_width)
            # The column must be at least wide enough for its header name.
            width = max(width, len(column.name))
            widths.append(width)

        # The table inputs are built once, after the width loop, so they are
        # defined even when display_columns is empty.
        data = [[column.render(token) for column in display_columns] for token in tokens]
        # A header row is only emitted when at least one column has a name.
        if any(len(column.name) > 0 for column in display_columns):
            header = [column.name for column in display_columns]
        else:
            header = None
        aligns = [column.aligns for column in display_columns]
        fg_colors = [column.fg_color for column in display_columns]
        bg_colors = [column.bg_color for column in display_columns]
        return wasabi.table(
            data,
            header=header,
            divider=True,
            aligns=aligns,
            widths=widths,
            fg_colors=fg_colors,
            bg_colors=bg_colors,
            spacing=spacing,
        )