Add render_text

2025-08-04 04:10:20 +03:00 · 2021-12-08 17:47:29 +01:00 · 2021-12-08 17:47:29 +01:00 · bd00611259
commit bd00611259
parent 49f3fd39b9
3 changed files with 121 additions and 64 deletions
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -549,7 +549,7 @@ def fully_featured_doc_two_sentences(en_vocab):
        "it",
        "."
    ]
-    spaces = [False, True, True, True, True, True, True, True, False, False, True, True, False, False]
+    spaces = [False, True, True, True, True, True, True, True, False, True, True, True, False, False]
    pos = [
        "PROPN",
        "PART",
--- a/spacy/tests/test_visualization.py
+++ b/spacy/tests/test_visualization.py
@@ -639,3 +639,45 @@ def test_visualization_rich_render_table_two_sentences(
        if supports_ansi
        else "\n  tree   dep        index   text      lemma     pos     tag   morph                       ent   \n------   --------   -----   -------   -------   -----   ---   -------------------------   ------\n  ╔>╔═   poss           0   Sarah     sarah     PROPN   NNP   NounType=prop|Number=sing   PERSON\n  ║ ╚>   case           1   's        's        PART    POS   Poss=yes                          \n╔>╚═══   nsubj          2   sister    sister    NOUN    NN    Number=sing                       \n╠═════   ROOT           3   flew      fly       VERB    VBD   Tense=past|VerbForm=fin           \n╠>╔═══   prep           4   to        to        ADP     IN                                      \n║ ║ ╔>   compound       5   Silicon   silicon   PROPN   NNP   NounType=prop|Number=sing   GPE   \n║ ╚>╚═   pobj           6   Valley    valley    PROPN   NNP   NounType=prop|Number=sing   GPE   \n╠══>╔═   prep           7   via       via       ADP     IN                                      \n║   ╚>   pobj           8   London    london    PROPN   NNP   NounType=prop|Number=sing   GPE   \n╚════>   punct          9   .         .         PUNCT   .     PunctType=peri                    \n\n\ntree   dep     index   text    lemma   pos     tag   morph                                                    ent\n----   -----   -----   -----   -----   -----   ---   ------------------------------------------------------   ---\n  ╔>   nsubj      10   She     she     PRON    PRP   Case=Nom|Gender=Fem|Number=Sing|Person=3|PronType=Prs       \n  ╠═   ROOT       11   loved   love    VERB    VBD   Tense=Past|VerbForm=Fin                                     \n  ╠>   dobj       12   it      it      PRON    PRP   Case=Acc|Gender=Neut|Number=Sing|Person=3|PronType=Prs      \n  ╚>   punct      13   .       .       PUNCT   .     PunctType=peri                                              \n\n"
    )
+
+def test_render_text_with_text_format(
+    fully_featured_doc_two_sentences,
+):
+    """Check render_text when the formats include an explicit "text" format."""
+    formats = [
+        AttributeFormat(
+            "ent_type_",
+            fg_color=50,
+            value_dependent_fg_colors={"PERSON": 50},
+            value_dependent_bg_colors={"PERSON": 12},
+        ),
+        AttributeFormat(
+            "text",
+            fg_color=50,
+            bg_color=53,
+            value_dependent_fg_colors={"PERSON": 50},
+            value_dependent_bg_colors={"PERSON": 12},
+        ),
+        AttributeFormat(
+            "lemma_",
+            fg_color=50,
+            bg_color=53,
+            permitted_values=("fly", "valley"),
+        ),
+    ]
+    ansi_expected = "\x1b[38;5;50;48;5;53mSarah\x1b[0m \x1b[38;5;50;48;5;12mPERSON\x1b[0m's sister \x1b[38;5;50;48;5;53mflew\x1b[0m \x1b[38;5;50;48;5;53mfly\x1b[0m to \x1b[38;5;50;48;5;53mSilicon\x1b[0m \x1b[38;5;50mGPE\x1b[0m \x1b[38;5;50;48;5;53mValley\x1b[0m \x1b[38;5;50mGPE\x1b[0m \x1b[38;5;50;48;5;53mvalley\x1b[0m via \x1b[38;5;50;48;5;53mLondon\x1b[0m \x1b[38;5;50mGPE\x1b[0m. She loved it."
+    plain_expected = "Sarah PERSON's sister flew fly to Silicon GPE Valley GPE valley via London GPE. She loved it."
+    # Choose the expectation before comparing. Written inline as
+    # `assert a == b if cond else c`, the conditional binds looser than `==`,
+    # so the else-branch asserted a non-empty string and could never fail.
+    # NOTE(review): `supports_ansi` is taken from module scope, as in the
+    # preceding tests; confirm it is a bool and not an uncalled function.
+    expected = ansi_expected if supports_ansi else plain_expected
+    assert (
+        Visualizer().render_text(fully_featured_doc_two_sentences, formats)
+        == expected
+    )
+
+def test_render_text_without_text_format(
+    fully_featured_doc_two_sentences,
+):
+    """Check render_text when no "text" format is passed in the formats list."""
+    formats = [
+        AttributeFormat(
+            "ent_type_",
+            value_dependent_fg_colors={"PERSON": 50},
+            value_dependent_bg_colors={"PERSON": 12},
+        ),
+        AttributeFormat(
+            "lemma_",
+            permitted_values=("fly", "valley"),
+        ),
+    ]
+    ansi_expected = "Sarah \x1b[38;5;50;48;5;12mPERSON\x1b[0m's sister flew fly to Silicon GPE Valley GPE valley via London GPE. She loved it."
+    plain_expected = "Sarah PERSON's sister flew fly to Silicon GPE Valley GPE valley via London GPE. She loved it."
+    # Choose the expectation before comparing. Written inline as
+    # `assert a == b if cond else c`, the conditional binds looser than `==`,
+    # so the else-branch asserted a non-empty string and could never fail.
+    # NOTE(review): `supports_ansi` is taken from module scope, as in the
+    # preceding tests; confirm it is a bool and not an uncalled function.
+    expected = ansi_expected if supports_ansi else plain_expected
+    assert (
+        Visualizer().render_text(fully_featured_doc_two_sentences, formats)
+        == expected
+    )
--- a/spacy/visualization.py
+++ b/spacy/visualization.py
@@ -6,63 +6,6 @@ from spacy.tokens import Span, Token, Doc
 from spacy.util import working_dir


-class AttributeFormat:
-    def __init__(
-        self,
-        attribute: str,
-        *,
-        name: str = "",
-        aligns: str = "l",
-        max_width: int = None,
-        fg_color: Union[str, int] = None,
-        bg_color: Union[str, int] = None,
-        permitted_values: tuple = None,
-        value_dependent_fg_colors: dict[str, Union[str, int]] = None,
-        value_dependent_bg_colors: dict[str, Union[str, int]] = None,
-    ):
-        self.attribute = attribute
-        self.name = name
-        self.aligns = aligns
-        self.max_width = max_width
-        self.fg_color = fg_color
-        self.bg_color = bg_color
-        self.permitted_values = permitted_values
-        self.value_dependent_fg_colors = value_dependent_fg_colors
-        self.value_dependent_bg_colors = value_dependent_bg_colors
-        self.printer = wasabi.Printer(no_print=True)
-
-    def render(
-        self,
-        token: Token,
-        *,
-        ignore_colors: bool = False,
-    ) -> str:
-        obj = token
-        parts = self.attribute.split(".")
-        for part in parts[:-1]:
-            obj = getattr(obj, part)
-        value = str(getattr(obj, parts[-1]))
-        if self.permitted_values is not None and value not in (
-            str(v) for v in self.permitted_values
-        ):
-            return ""
-        if self.max_width is not None:
-            value = value[: self.max_width]
-        fg_color = (
-            self.value_dependent_fg_colors.get(value, None)
-            if not ignore_colors and self.value_dependent_fg_colors is not None
-            else None
-        )
-        bg_color = (
-            self.value_dependent_bg_colors.get(value, None)
-            if not ignore_colors and self.value_dependent_bg_colors is not None
-            else None
-        )
-        if fg_color is not None or bg_color is not None:
-            value = self.printer.text(value, color=fg_color, bg_color=bg_color)
-        return value
-
-
 SPACE = 0
 HALF_HORIZONTAL_LINE = 1  # the half is the half further away from the root
 FULL_HORIZONTAL_LINE = 3
@@ -98,6 +41,65 @@ ROOT_LEFT_CHARS = {
 }


+class AttributeFormat:
+    """Rendering instructions for a single token attribute.
+
+    ``attribute`` names the attribute to read from a token; it may be a
+    dotted path, in which case each segment is resolved with ``getattr``.
+    ``name`` and ``aligns`` are only stored here and are not read by this
+    class.
+    """
+
+    def __init__(
+        self,
+        attribute: str,
+        *,
+        name: str = "",
+        aligns: str = "l",
+        max_width: int = None,
+        fg_color: Union[str, int] = None,
+        bg_color: Union[str, int] = None,
+        permitted_values: tuple = None,
+        value_dependent_fg_colors: dict[str, Union[str, int]] = None,
+        value_dependent_bg_colors: dict[str, Union[str, int]] = None,
+    ):
+        # What to read from the token, and how to label/align it.
+        self.attribute = attribute
+        self.name = name
+        self.aligns = aligns
+        # Optional truncation and filtering of the rendered value.
+        self.max_width = max_width
+        self.permitted_values = permitted_values
+        # Default colors, and per-value overrides keyed by the rendered value.
+        self.fg_color = fg_color
+        self.bg_color = bg_color
+        self.value_dependent_fg_colors = value_dependent_fg_colors
+        self.value_dependent_bg_colors = value_dependent_bg_colors
+        # no_print=True: the printer is used to build strings, not to print.
+        self.printer = wasabi.Printer(no_print=True)
+
+    @staticmethod
+    def _resolve_color(value, colors_by_value, default_color, use_default):
+        """Return the color for *value*: per-value entry first, then the default if allowed."""
+        chosen = None
+        if colors_by_value is not None:
+            chosen = colors_by_value.get(value)
+        if chosen is None and use_default:
+            chosen = default_color
+        return chosen
+
+    def render(
+        self,
+        token: Token,
+        *,
+        ignore_colors: bool = False,
+        render_all_colors_within_values: bool = False,
+    ) -> str:
+        """Return the string form of this attribute for *token*.
+
+        An empty string is returned when ``permitted_values`` is set and the
+        value is not among them. With ``ignore_colors`` no color is applied.
+        With ``render_all_colors_within_values`` the general ``fg_color`` /
+        ``bg_color`` are used wherever no value-dependent color matches.
+        """
+        # Follow the (possibly dotted) attribute path starting at the token.
+        target = token
+        for segment in self.attribute.split("."):
+            target = getattr(target, segment)
+        value = str(target)
+
+        if self.permitted_values is not None:
+            is_permitted = any(
+                value == str(candidate) for candidate in self.permitted_values
+            )
+            if not is_permitted:
+                return ""
+
+        # Truncation happens before the color lookup, so the value-dependent
+        # dictionaries are consulted with the truncated value.
+        if self.max_width is not None:
+            value = value[: self.max_width]
+
+        fg_color = bg_color = None
+        if value and not ignore_colors:
+            fg_color = self._resolve_color(
+                value,
+                self.value_dependent_fg_colors,
+                self.fg_color,
+                render_all_colors_within_values,
+            )
+            bg_color = self._resolve_color(
+                value,
+                self.value_dependent_bg_colors,
+                self.bg_color,
+                render_all_colors_within_values,
+            )
+
+        if fg_color is None and bg_color is None:
+            return value
+        return self.printer.text(value, color=fg_color, bg_color=bg_color)
+
+
 class Visualizer:
    @staticmethod
    def render_dependency_tree(sent: Span, root_right: bool) -> list[str]:
@@ -384,10 +386,23 @@ class Visualizer:

    def render_text(self, doc: Doc, attributes: list[AttributeFormat]) -> str:
+        """Return the document text with rendered attribute values inserted after tokens.
+
+        For every token, each format in *attributes* other than the "text"
+        format is rendered with ``render_all_colors_within_values=True``;
+        each non-empty result is appended after the token, preceded by one
+        space. The token's trailing whitespace follows the inserted values.
+
+        doc: the document whose tokens are rendered.
+        attributes: the formats to apply; a format whose ``attribute`` is
+            "text" controls how the token text itself is rendered.
+        RETURNS: the concatenated string for all tokens.
+        """
        return_string = ""
+        # Only the first "text" format is honoured; a default AttributeFormat
+        # is used when the caller supplies none.
+        text_attributes = [a for a in attributes if a.attribute == "text"]
+        text_attribute = (
+            text_attributes[0] if len(text_attributes) > 0 else AttributeFormat("text")
+        )
        for token in doc:
-            return_string += token.text_with_ws
-            for attribute in attributes:
-                if self.get_entity(
-                    token,
-                ):
-                    pass
+            # Slot 0 is reserved for the token's own text and filled in below.
+            this_token_strings = [""]
+            # Render every non-"text" attribute; empty results are skipped.
+            for attribute in (a for a in attributes if a.attribute != "text"):
+                attribute_text = attribute.render(
+                    token, render_all_colors_within_values=True
+                )
+                if attribute_text is not None and len(attribute_text) > 0:
+                    this_token_strings.append(" " + attribute_text)
+            # The "text" format is applied only when at least one other
+            # attribute produced output for this token; otherwise the plain
+            # token text is used.
+            this_token_strings[0] = (
+                token.text
+                if len(this_token_strings) == 1
+                else text_attribute.render(token, render_all_colors_within_values=True)
+            )
+            # Trailing whitespace goes after any inserted attribute values.
+            this_token_strings.append(token.whitespace_)
+            return_string += "".join(this_token_strings)
+        return return_string