Add render_text

2025-08-04 04:10:20 +03:00 · 2021-12-08 17:47:29 +01:00 · 2021-12-08 17:47:29 +01:00 · bd00611259
commit bd00611259
parent 49f3fd39b9
3 changed files with 121 additions and 64 deletions
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@ -549,7 +549,7 @@ def fully_featured_doc_two_sentences(en_vocab):
        "it",
        "."
    ]
-    spaces = [False, True, True, True, True, True, True, True, False, False, True, True, False, False]
+    spaces = [False, True, True, True, True, True, True, True, False, True, True, True, False, False]
    pos = [
        "PROPN",
        "PART",
--- a/spacy/tests/test_visualization.py
+++ b/spacy/tests/test_visualization.py
@ -639,3 +639,45 @@ def test_visualization_rich_render_table_two_sentences(
        if supports_ansi
        else "\n  tree   dep        index   text      lemma     pos     tag   morph                       ent   \n------   --------   -----   -------   -------   -----   ---   -------------------------   ------\n  ╔>╔═   poss           0   Sarah     sarah     PROPN   NNP   NounType=prop|Number=sing   PERSON\n  ║ ╚>   case           1   's        's        PART    POS   Poss=yes                          \n╔>╚═══   nsubj          2   sister    sister    NOUN    NN    Number=sing                       \n╠═════   ROOT           3   flew      fly       VERB    VBD   Tense=past|VerbForm=fin           \n╠>╔═══   prep           4   to        to        ADP     IN                                      \n║ ║ ╔>   compound       5   Silicon   silicon   PROPN   NNP   NounType=prop|Number=sing   GPE   \n║ ╚>╚═   pobj           6   Valley    valley    PROPN   NNP   NounType=prop|Number=sing   GPE   \n╠══>╔═   prep           7   via       via       ADP     IN                                      \n║   ╚>   pobj           8   London    london    PROPN   NNP   NounType=prop|Number=sing   GPE   \n╚════>   punct          9   .         .         PUNCT   .     PunctType=peri                    \n\n\ntree   dep     index   text    lemma   pos     tag   morph                                                    ent\n----   -----   -----   -----   -----   -----   ---   ------------------------------------------------------   ---\n  ╔>   nsubj      10   She     she     PRON    PRP   Case=Nom|Gender=Fem|Number=Sing|Person=3|PronType=Prs       \n  ╠═   ROOT       11   loved   love    VERB    VBD   Tense=Past|VerbForm=Fin                                     \n  ╠>   dobj       12   it      it      PRON    PRP   Case=Acc|Gender=Neut|Number=Sing|Person=3|PronType=Prs      \n  ╚>   punct      13   .       .       PUNCT   .     PunctType=peri                                              \n\n"
    )
 def test_render_text_with_text_format(
    fully_featured_doc_two_sentences,
 ):
    formats = [
        AttributeFormat(
            "ent_type_",
            fg_color=50,
            value_dependent_fg_colors={"PERSON": 50},
            value_dependent_bg_colors={"PERSON": 12},
        ),
        AttributeFormat(
            "text",
            fg_color=50,
            bg_color=53,
            value_dependent_fg_colors={"PERSON": 50},
            value_dependent_bg_colors={"PERSON": 12},
        ),
        AttributeFormat(
            "lemma_",
            fg_color=50,
            bg_color=53,
            permitted_values=("fly", "valley")
        ),
    ]
    assert Visualizer().render_text(fully_featured_doc_two_sentences, formats) == "\x1b[38;5;50;48;5;53mSarah\x1b[0m \x1b[38;5;50;48;5;12mPERSON\x1b[0m's sister \x1b[38;5;50;48;5;53mflew\x1b[0m \x1b[38;5;50;48;5;53mfly\x1b[0m to \x1b[38;5;50;48;5;53mSilicon\x1b[0m \x1b[38;5;50mGPE\x1b[0m \x1b[38;5;50;48;5;53mValley\x1b[0m \x1b[38;5;50mGPE\x1b[0m \x1b[38;5;50;48;5;53mvalley\x1b[0m via \x1b[38;5;50;48;5;53mLondon\x1b[0m \x1b[38;5;50mGPE\x1b[0m. She loved it." if supports_ansi else "Sarah PERSON's sister flew fly to Silicon GPE Valley GPE valley via London GPE. She loved it."
 def test_render_text_without_text_format(
    fully_featured_doc_two_sentences,
 ):
    formats = [
        AttributeFormat(
            "ent_type_",
            value_dependent_fg_colors={"PERSON": 50},
            value_dependent_bg_colors={"PERSON": 12},
        ),
        AttributeFormat(
            "lemma_",
            permitted_values=("fly", "valley")
        ),
    ]
    assert Visualizer().render_text(fully_featured_doc_two_sentences, formats) == "Sarah \x1b[38;5;50;48;5;12mPERSON\x1b[0m's sister flew fly to Silicon GPE Valley GPE valley via London GPE. She loved it." if supports_ansi else "Sarah PERSON's sister flew fly to Silicon GPE Valley GPE valley via London GPE. She loved it."
--- a/spacy/visualization.py
+++ b/spacy/visualization.py
@ -6,63 +6,6 @@ from spacy.tokens import Span, Token, Doc
 from spacy.util import working_dir
 class AttributeFormat:
    def __init__(
        self,
        attribute: str,
        *,
        name: str = "",
        aligns: str = "l",
        max_width: int = None,
        fg_color: Union[str, int] = None,
        bg_color: Union[str, int] = None,
        permitted_values: tuple = None,
        value_dependent_fg_colors: dict[str, Union[str, int]] = None,
        value_dependent_bg_colors: dict[str, Union[str, int]] = None,
    ):
        self.attribute = attribute
        self.name = name
        self.aligns = aligns
        self.max_width = max_width
        self.fg_color = fg_color
        self.bg_color = bg_color
        self.permitted_values = permitted_values
        self.value_dependent_fg_colors = value_dependent_fg_colors
        self.value_dependent_bg_colors = value_dependent_bg_colors
        self.printer = wasabi.Printer(no_print=True)
    def render(
        self,
        token: Token,
        *,
        ignore_colors: bool = False,
    ) -> str:
        obj = token
        parts = self.attribute.split(".")
        for part in parts[:-1]:
            obj = getattr(obj, part)
        value = str(getattr(obj, parts[-1]))
        if self.permitted_values is not None and value not in (
            str(v) for v in self.permitted_values
        ):
            return ""
        if self.max_width is not None:
            value = value[: self.max_width]
        fg_color = (
            self.value_dependent_fg_colors.get(value, None)
            if not ignore_colors and self.value_dependent_fg_colors is not None
            else None
        )
        bg_color = (
            self.value_dependent_bg_colors.get(value, None)
            if not ignore_colors and self.value_dependent_bg_colors is not None
            else None
        )
        if fg_color is not None or bg_color is not None:
            value = self.printer.text(value, color=fg_color, bg_color=bg_color)
        return value
 SPACE = 0
 HALF_HORIZONTAL_LINE = 1  # the half is the half further away from the root
 FULL_HORIZONTAL_LINE = 3
@ -98,6 +41,65 @@ ROOT_LEFT_CHARS = {
 }
 class AttributeFormat:
    def __init__(
        self,
        attribute: str,
        *,
        name: str = "",
        aligns: str = "l",
        max_width: int = None,
        fg_color: Union[str, int] = None,
        bg_color: Union[str, int] = None,
        permitted_values: tuple = None,
        value_dependent_fg_colors: dict[str, Union[str, int]] = None,
        value_dependent_bg_colors: dict[str, Union[str, int]] = None,
    ):
        self.attribute = attribute
        self.name = name
        self.aligns = aligns
        self.max_width = max_width
        self.fg_color = fg_color
        self.bg_color = bg_color
        self.permitted_values = permitted_values
        self.value_dependent_fg_colors = value_dependent_fg_colors
        self.value_dependent_bg_colors = value_dependent_bg_colors
        self.printer = wasabi.Printer(no_print=True)
    def render(
        self,
        token: Token,
        *,
        ignore_colors: bool = False,
        render_all_colors_within_values: bool = False,
    ) -> str:
        obj = token
        parts = self.attribute.split(".")
        for part in parts[:-1]:
            obj = getattr(obj, part)
        value = str(getattr(obj, parts[-1]))
        if self.permitted_values is not None and value not in (
            str(v) for v in self.permitted_values
        ):
            return ""
        if self.max_width is not None:
            value = value[: self.max_width]
        fg_color = None
        bg_color = None
        if not ignore_colors and len(value) > 0:
            if self.value_dependent_fg_colors is not None:
                fg_color = self.value_dependent_fg_colors.get(value, None)
            if fg_color is None and render_all_colors_within_values:
                fg_color = self.fg_color
            if self.value_dependent_bg_colors is not None:
                bg_color = self.value_dependent_bg_colors.get(value, None)
            if bg_color is None and render_all_colors_within_values:
                bg_color = self.bg_color
        if fg_color is not None or bg_color is not None:
            value = self.printer.text(value, color=fg_color, bg_color=bg_color)
        return value
 class Visualizer:
    @staticmethod
    def render_dependency_tree(sent: Span, root_right: bool) -> list[str]:
@ -384,10 +386,23 @@ class Visualizer:
    def render_text(self, doc: Doc, attributes: list[AttributeFormat]) -> str:
        return_string = ""
        text_attributes = [a for a in attributes if a.attribute == "text"]
        text_attribute = (
            text_attributes[0] if len(text_attributes) > 0 else AttributeFormat("text")
        )
        for token in doc:
-            return_string += token.text_with_ws
+            this_token_strings = [""]
-            for attribute in attributes:
+            for attribute in (a for a in attributes if a.attribute != "text"):
-                if self.get_entity(
+                attribute_text = attribute.render(
-                    token,
+                    token, render_all_colors_within_values=True
-                ):
+                )
-                    pass
+                if attribute_text is not None and len(attribute_text) > 0:
                    this_token_strings.append(" " + attribute_text)
            this_token_strings[0] = (
                token.text
                if len(this_token_strings) == 1
                else text_attribute.render(token, render_all_colors_within_values=True)
            )
            this_token_strings.append(token.whitespace_)
            return_string += "".join(this_token_strings)
        return return_string