From 0566c3a166c7ccfb5a1bddb025dddf9c576a9ed2 Mon Sep 17 00:00:00 2001 From: Connor Brinton Date: Thu, 13 Jul 2023 11:33:05 -0400 Subject: [PATCH] =?UTF-8?q?=F0=9F=90=9B=20Escape=20annotated=20HTML=20tags?= =?UTF-8?q?=20in=20span=20renderer=20(#12817)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These changes add a missing call to `escape_html` in the displaCy span renderer. Previously span-annotated tokens would be inserted into the page markup without being escaped, resulting in potentially incorrect rendering. When I encountered this issue, it resulted in some docs and span underlines being superimposed on top of properly rendered docs and span underlines near the beginning of the visualization (due to an unescaped `<span>` tag). --- spacy/displacy/render.py | 3 +-- spacy/tests/test_displacy.py | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py index 86869e3b8..47407bcb7 100644 --- a/spacy/displacy/render.py +++ b/spacy/displacy/render.py @@ -1,4 +1,3 @@ -import itertools import uuid from typing import Any, Dict, List, Optional, Tuple, Union @@ -218,7 +217,7 @@ class SpanRenderer: + (self.offset_step * (len(entities) - 1)) ) markup += self.span_template.format( - text=token["text"], + text=escape_html(token["text"]), span_slices=slices, span_starts=starts, total_height=total_height, diff --git a/spacy/tests/test_displacy.py b/spacy/tests/test_displacy.py index ce103068a..1570f8d09 100644 --- a/spacy/tests/test_displacy.py +++ b/spacy/tests/test_displacy.py @@ -377,3 +377,22 @@ def test_displacy_manual_sorted_entities(): html = displacy.render(doc, style="ent", manual=True) assert html.find("FIRST") < html.find("SECOND") + + +@pytest.mark.issue(12816) +def test_issue12816(en_vocab) -> None: + """Test that displaCy's span visualizer escapes annotated HTML tags correctly.""" + # Create a doc containing an annotated word and an 
unannotated HTML tag + doc = Doc(en_vocab, words=["test", "<TEST>"]) + doc.spans["sc"] = [Span(doc, 0, 1, label="test")] + + # Verify that the HTML tag is escaped when unannotated + html = displacy.render(doc, style="span") + assert "&lt;TEST&gt;" in html + + # Annotate the HTML tag + doc.spans["sc"].append(Span(doc, 1, 2, label="test")) + + # Verify that the HTML tag is still escaped + html = displacy.render(doc, style="span") + assert "&lt;TEST&gt;" in html