spaCy/spacy/tests/test_displacy.py

import numpy
import pytest

from spacy import displacy
from spacy.displacy.render import DependencyRenderer, EntityRenderer, SpanRenderer
from spacy.lang.en import English
from spacy.lang.fa import Persian
from spacy.tokens import Doc, Span


@pytest.mark.issue(2361)
def test_issue2361(de_vocab):
    """Check that HTML-special characters are escaped in the rendered markup."""
    raw_tokens = ["<", ">", "&", '"']
    escaped_entities = ("&lt;", "&gt;", "&amp;", "&quot;")
    dep_labels = ["dep"] * len(raw_tokens)
    doc = Doc(de_vocab, words=raw_tokens, deps=dep_labels)
    markup = displacy.render(doc)
    # Collect every expected entity that is absent so a failure names them all.
    missing = [entity for entity in escaped_entities if entity not in markup]
    assert not missing


@pytest.mark.issue(2728)
def test_issue2728(en_vocab):
    """Check that the ENT visualizer escapes an HTML-like token, whether the
    entity sits on another token or on the HTML-like token itself."""
    escaped_tag = "&lt;RELEASE&gt;"
    doc = Doc(en_vocab, words=["test", "<RELEASE>", "test"])
    # First the entity covers the leading word, then the tag token itself.
    for ent_start, ent_end in ((0, 1), (1, 2)):
        doc.ents = [Span(doc, ent_start, ent_end, label="TEST")]
        assert escaped_tag in displacy.render(doc, style="ent")


@pytest.mark.issue(3288)
def test_issue3288(en_vocab):
    """Test that retokenization works correctly via displaCy when punctuation
    is merged onto the preceding token and tensor is resized.

    The test passes as long as rendering the Doc does not raise.
    """
    tokens = ["Hello", "World", "!", "When", "is", "this", "breaking", "?"]
    head_indices = [1, 1, 1, 4, 4, 6, 4, 4]
    dep_labels = [
        "intj",
        "ROOT",
        "punct",
        "advmod",
        "ROOT",
        "det",
        "nsubj",
        "punct",
    ]
    doc = Doc(en_vocab, words=tokens, heads=head_indices, deps=dep_labels)
    # Give the Doc a tensor with one row per token before rendering.
    tensor_shape = (len(tokens), 96)
    doc.tensor = numpy.zeros(tensor_shape, dtype="float32")
    displacy.render(doc)


@pytest.mark.issue(3531)
def test_issue3531():
    """Test that manual dep and ent input renders without a "settings" key."""
    tagged_words = [
        ("But", "CCONJ"),
        ("Google", "PROPN"),
        ("is", "VERB"),
        ("starting", "VERB"),
        ("from", "ADP"),
        ("behind.", "ADV"),
    ]
    arc_specs = [
        (0, 3, "cc", "left"),
        (1, 3, "nsubj", "left"),
        (2, 3, "aux", "left"),
        (3, 4, "prep", "right"),
        (4, 5, "pcomp", "right"),
    ]
    # Neither manual input below carries a "settings" entry.
    dep_input = {
        "words": [{"text": text, "tag": tag} for text, tag in tagged_words],
        "arcs": [
            {"start": start, "end": end, "label": label, "dir": direction}
            for start, end, label, direction in arc_specs
        ],
    }
    ent_input = {
        "text": "But Google is starting from behind.",
        "ents": [{"start": 4, "end": 10, "label": "ORG"}],
    }
    # A non-empty rendering result is all that is checked here.
    assert displacy.render(dep_input, style="dep", manual=True)
    assert displacy.render(ent_input, style="ent", manual=True)


@pytest.mark.issue(3882)
def test_issue3882(en_vocab):
    """Test that displaCy doesn't serialize the doc.user_data when making a
    copy of the Doc.

    The test passes as long as parse_deps does not raise.
    """
    tokens = ["Hello", "world"]
    dep_labels = ["dep", "dep"]
    doc = Doc(en_vocab, words=tokens, deps=dep_labels)
    # NOTE(review): a set is presumably not serializable, so serializing
    # user_data would fail here -- confirm against the original issue.
    doc.user_data["test"] = set()
    displacy.parse_deps(doc)


@pytest.mark.issue(5447)
def test_issue5447():
    """Test that overlapping arcs are placed on separate levels, while
    identical arcs do not add a level."""
    tagged_words = [("This", "DT"), ("is", "VBZ"), ("a", "DT"), ("sentence.", "NN")]
    words = [{"text": text, "tag": tag} for text, tag in tagged_words]
    arcs = [
        {"start": 0, "end": 1, "label": "nsubj", "dir": "left"},
        {"start": 2, "end": 3, "label": "det", "dir": "left"},
        {"start": 2, "end": 3, "label": "overlap", "dir": "left"},
        # Same arc as the previous one, with its keys in a different order.
        {"end": 3, "label": "overlap", "start": 2, "dir": "left"},
        {"start": 1, "end": 3, "label": "attr", "dir": "left"},
    ]
    dep_renderer = DependencyRenderer()
    dep_renderer.render([{"words": words, "arcs": arcs}])
    assert dep_renderer.highest_level == 3


@pytest.mark.issue(5838)
def test_issue5838():
    """Test that the EntityRenderer emits line breaks after the last entity.

    The text contains four newlines and a single entity; the rendered HTML
    must contain four "<br>" tags.
    """
    text_with_newlines = (
        "First line\nSecond line, with ent\nThird line\nFourth line\n"
    )
    pipeline = English()
    doc = pipeline(text_with_newlines)
    doc.ents = [Span(doc, 7, 8, label="test")]
    markup = displacy.render(doc, style="ent")
    assert markup.count("<br>") == 4


def test_displacy_parse_spans(en_vocab):
    """Test that displacy.parse_spans turns a Doc's spans into displaCy's format."""
    tokens = ["Welcome", "to", "the", "Bank", "of", "China"]
    doc = Doc(en_vocab, words=tokens)
    org_span = Span(doc, 3, 6, "ORG")
    gpe_span = Span(doc, 5, 6, "GPE")
    doc.spans["sc"] = [org_span, gpe_span]

    # Character offsets refer to the parsed text; token offsets to the Doc.
    expected_org = {
        "start": 15,
        "end": 28,
        "start_token": 3,
        "end_token": 6,
        "label": "ORG",
        "kb_id": "",
        "kb_url": "#",
    }
    expected_gpe = {
        "start": 23,
        "end": 28,
        "start_token": 5,
        "end_token": 6,
        "label": "GPE",
        "kb_id": "",
        "kb_url": "#",
    }

    parsed = displacy.parse_spans(doc)
    assert isinstance(parsed, dict)
    assert parsed["text"] == "Welcome to the Bank of China "
    assert parsed["spans"] == [expected_org, expected_gpe]


def test_displacy_parse_spans_with_kb_id_options(en_vocab):
    """Test that spans carrying a kb_id are converted into displaCy's format
    and that the "kb_url_template" option is used to build each kb_url."""
    tokens = ["Welcome", "to", "the", "Bank", "of", "China"]
    doc = Doc(en_vocab, words=tokens)
    org_span = Span(doc, 3, 6, "ORG", kb_id="Q790068")
    gpe_span = Span(doc, 5, 6, "GPE", kb_id="Q148")
    doc.spans["sc"] = [org_span, gpe_span]

    options = {"kb_url_template": "https://wikidata.org/wiki/{}"}
    parsed = displacy.parse_spans(doc, options)

    expected_org = {
        "start": 15,
        "end": 28,
        "start_token": 3,
        "end_token": 6,
        "label": "ORG",
        "kb_id": "Q790068",
        "kb_url": "https://wikidata.org/wiki/Q790068",
    }
    expected_gpe = {
        "start": 23,
        "end": 28,
        "start_token": 5,
        "end_token": 6,
        "label": "GPE",
        "kb_id": "Q148",
        "kb_url": "https://wikidata.org/wiki/Q148",
    }
    assert isinstance(parsed, dict)
    assert parsed["text"] == "Welcome to the Bank of China "
    assert parsed["spans"] == [expected_org, expected_gpe]


def test_displacy_parse_spans_different_spans_key(en_vocab):
    """Test that the "spans_key" option selects which span group is parsed."""
    tokens = ["Welcome", "to", "the", "Bank", "of", "China"]
    doc = Doc(en_vocab, words=tokens)
    # Two groups are populated; only the "custom" one should be returned.
    doc.spans["sc"] = [Span(doc, 3, 6, "ORG"), Span(doc, 5, 6, "GPE")]
    doc.spans["custom"] = [Span(doc, 3, 6, "BANK")]

    parsed = displacy.parse_spans(doc, options={"spans_key": "custom"})

    expected_bank = {
        "start": 15,
        "end": 28,
        "start_token": 3,
        "end_token": 6,
        "label": "BANK",
        "kb_id": "",
        "kb_url": "#",
    }
    assert isinstance(parsed, dict)
    assert parsed["text"] == "Welcome to the Bank of China "
    assert parsed["spans"] == [expected_bank]


def test_displacy_parse_empty_spans_key(en_vocab):
    """Test that parsing with an unset spans key warns (W117) instead of
    raising, and still returns a dict."""
    tokens = ["Welcome", "to", "the", "Bank", "of", "China"]
    doc = Doc(en_vocab, words=tokens)
    # Only the "custom" group is filled; parse_spans is called without
    # options, so the key it looks up has no spans.
    doc.spans["custom"] = [Span(doc, 3, 6, "BANK")]

    with pytest.warns(UserWarning, match="W117"):
        parsed = displacy.parse_spans(doc)

    assert isinstance(parsed, dict)


def test_displacy_parse_ents(en_vocab):
    """Test that named entities on a Doc are converted into displaCy's format,
    both without and with a kb_id on the entity."""
    tokens = ["But", "Google", "is", "starting", "from", "behind"]
    doc = Doc(en_vocab, words=tokens)
    # No URL template is passed, so kb_url is expected to be "#" in both runs.
    for span_kwargs, expected_kb_id in (({}, ""), ({"kb_id": "Q95"}, "Q95")):
        org_label = doc.vocab.strings["ORG"]
        doc.ents = [Span(doc, 1, 2, label=org_label, **span_kwargs)]
        parsed = displacy.parse_ents(doc)
        expected_ent = {
            "start": 4,
            "end": 10,
            "label": "ORG",
            "kb_id": expected_kb_id,
            "kb_url": "#",
        }
        assert isinstance(parsed, dict)
        assert parsed["text"] == "But Google is starting from behind "
        assert parsed["ents"] == [expected_ent]


def test_displacy_parse_ents_with_kb_id_options(en_vocab):
    """Test that an entity's kb_id is combined with the "kb_url_template"
    option when named entities are converted into displaCy's format."""
    tokens = ["But", "Google", "is", "starting", "from", "behind"]
    doc = Doc(en_vocab, words=tokens)
    org_label = doc.vocab.strings["ORG"]
    doc.ents = [Span(doc, 1, 2, label=org_label, kb_id="Q95")]

    options = {"kb_url_template": "https://www.wikidata.org/wiki/{}"}
    parsed = displacy.parse_ents(doc, options)

    expected_ent = {
        "start": 4,
        "end": 10,
        "label": "ORG",
        "kb_id": "Q95",
        "kb_url": "https://www.wikidata.org/wiki/Q95",
    }
    assert isinstance(parsed, dict)
    assert parsed["text"] == "But Google is starting from behind "
    assert parsed["ents"] == [expected_ent]


def test_displacy_parse_deps(en_vocab):
    """Test that deps and tags on a Doc are converted into displaCy's format,
    and that a Span covering the whole Doc gives the same result."""
    words = ["This", "is", "a", "sentence"]
    heads = [1, 1, 3, 1]
    pos = ["DET", "VERB", "DET", "NOUN"]
    tags = ["DT", "VBZ", "DT", "NN"]
    dep_labels = ["nsubj", "ROOT", "det", "attr"]
    doc = Doc(
        en_vocab, words=words, heads=heads, pos=pos, tags=tags, deps=dep_labels
    )

    # The "tag" field of each parsed word is expected to hold the coarse POS.
    expected_words = [
        {"lemma": None, "text": word, "tag": coarse_pos}
        for word, coarse_pos in zip(words, pos)
    ]
    expected_arcs = [
        {"start": 0, "end": 1, "label": "nsubj", "dir": "left"},
        {"start": 2, "end": 3, "label": "det", "dir": "left"},
        {"start": 1, "end": 3, "label": "attr", "dir": "right"},
    ]

    parsed_from_doc = displacy.parse_deps(doc)
    # Test that displacy.parse_deps converts Span to Doc
    parsed_from_span = displacy.parse_deps(doc[:])

    for parsed in (parsed_from_doc, parsed_from_span):
        assert isinstance(parsed, dict)
        assert parsed["words"] == expected_words
        assert parsed["arcs"] == expected_arcs


def test_displacy_invalid_arcs():
    """Test that rendering raises ValueError when an arc's start/end indices
    fall outside the list of words."""
    words = [{"text": "This", "tag": "DET"}, {"text": "is", "tag": "VERB"}]
    valid_arc = {"start": 0, "end": 1, "label": "nsubj", "dir": "left"}
    # Only two words exist, so neither -1 nor 2 is a usable index.
    out_of_range_arc = {"start": -1, "end": 2, "label": "det", "dir": "left"}
    parsed = [{"words": words, "arcs": [valid_arc, out_of_range_arc]}]

    dep_renderer = DependencyRenderer()
    with pytest.raises(ValueError):
        dep_renderer.render(parsed)


def test_displacy_spans(en_vocab):
    """Test that displaCy can render a Span (a slice of a Doc) in ent style."""
    tokens = ["But", "Google", "is", "starting", "from", "behind"]
    doc = Doc(en_vocab, words=tokens)
    org_label = doc.vocab.strings["ORG"]
    doc.ents = [Span(doc, 1, 2, label=org_label)]
    doc_slice = doc[1:4]
    markup = displacy.render(doc_slice, style="ent")
    assert markup.startswith("<div")


def test_displacy_raises_for_wrong_type(en_vocab):
    """Test that passing a plain string to displacy.render raises ValueError."""
    plain_string = "hello world"
    with pytest.raises(ValueError):
        displacy.render(plain_string)


def test_displacy_rtl():
    """Test that dep and ent pages rendered for a Persian Doc carry RTL
    direction markers and the language code."""
    # Source: http://www.sobhe.ir/hazm/ – is this correct?
    words = ["ما", "بسیار", "کتاب", "می\u200cخوانیم"]
    # These are (likely) wrong, but it's just for testing
    fine_tags = ["PRO", "ADV", "N_PL", "V_SUB"]  # needs to match lang.fa.tag_map
    dep_labels = ["foo", "bar", "foo", "baz"]
    head_indices = [1, 0, 3, 1]
    nlp = Persian()
    doc = Doc(
        nlp.vocab, words=words, tags=fine_tags, heads=head_indices, deps=dep_labels
    )
    doc.ents = [Span(doc, 1, 3, label="TEST")]
    lang_attribute = f'lang="{nlp.lang}"'

    dep_page = displacy.render(doc, page=True, style="dep")
    assert "direction: rtl" in dep_page
    # Only the dep output is checked for the direction="rtl" attribute.
    assert 'direction="rtl"' in dep_page
    assert lang_attribute in dep_page

    ent_page = displacy.render(doc, page=True, style="ent")
    assert "direction: rtl" in ent_page
    assert lang_attribute in ent_page


def test_displacy_render_wrapper(en_vocab):
    """Test that displaCy accepts custom rendering wrapper.

    A wrapper that surrounds the markup with "TEST" is installed, and the
    rendered ent output must start and end with that marker.
    """

    def wrapper(html):
        return "TEST" + html + "TEST"

    displacy.set_render_wrapper(wrapper)
    try:
        doc = Doc(
            en_vocab, words=["But", "Google", "is", "starting", "from", "behind"]
        )
        doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"])]
        html = displacy.render(doc, style="ent")
        assert html.startswith("TEST<div")
        assert html.endswith("/div>TEST")
    finally:
        # Restore an identity wrapper even when an assertion above fails, so
        # the "TEST" wrapper does not leak into tests that run afterwards.
        displacy.set_render_wrapper(lambda html: html)


def test_displacy_render_manual_dep():
    """Test displacy.render with manual data for dep style.

    Every word's text and tag must appear in the rendered markup.
    """
    tagged_words = [("This", "DT"), ("is", "VBZ"), ("a", "DT"), ("sentence", "NN")]
    arc_specs = [
        (0, 1, "nsubj", "left"),
        (2, 3, "det", "left"),
        (1, 3, "attr", "right"),
    ]
    manual_dep = {
        "words": [{"text": text, "tag": tag} for text, tag in tagged_words],
        "arcs": [
            {"start": start, "end": end, "label": label, "dir": direction}
            for start, end, label, direction in arc_specs
        ],
        "title": "Title",
    }
    markup = displacy.render([manual_dep], style="dep", manual=True)
    for entry in manual_dep["words"]:
        assert entry["text"] in markup
        assert entry["tag"] in markup


def test_displacy_render_manual_ent():
    """Test displacy.render with manual data for ent style.

    The first entity label of every example, and the title where one is
    given, must appear in the rendered markup.
    """
    sentence = "But Google is starting from behind."
    untitled_example = {
        "text": sentence,
        "ents": [{"start": 4, "end": 10, "label": "ORG"}],
    }
    # This example uses offsets that reach beyond both ends of the text.
    titled_example = {
        "text": sentence,
        "ents": [{"start": -100, "end": 100, "label": "COMPANY"}],
        "title": "Title",
    }
    examples = [untitled_example, titled_example]

    markup = displacy.render(examples, style="ent", manual=True)
    for example in examples:
        first_label = example["ents"][0]["label"]
        assert first_label in markup
        if "title" in example:
            assert example["title"] in markup


def test_displacy_render_manual_span():
    """Test displacy.render with manual data for span style.

    The first span label of every example, and the title where one is given,
    must appear in the rendered markup.
    """

    def build_example():
        # Build fresh objects on every call so the examples share no state.
        return {
            "text": "Welcome to the Bank of China.",
            "spans": [
                {"start_token": 3, "end_token": 6, "label": "ORG"},
                {"start_token": 5, "end_token": 6, "label": "GPE"},
            ],
            "tokens": ["Welcome", "to", "the", "Bank", "of", "China", "."],
        }

    untitled_example = build_example()
    titled_example = build_example()
    titled_example["title"] = "Title"
    examples = [untitled_example, titled_example]

    markup = displacy.render(examples, style="span", manual=True)
    for example in examples:
        first_label = example["spans"][0]["label"]
        assert first_label in markup
        if "title" in example:
            assert example["title"] in markup


def test_displacy_options_case():
    """Test that the "ents" and "colors" options apply to labels regardless
    of the letter case used in the options and in the labels."""
    options = {
        "ents": ["foo", "BAR"],
        "colors": {"FOO": "red", "bar": "green"},
    }
    ent_renderer = EntityRenderer(options)
    labels = ["foo", "bar", "FOO", "BAR"]
    # One single-character entity per label, at offsets 0..3.
    spans = [
        {"start": offset, "end": offset + 1, "label": label}
        for offset, label in enumerate(labels)
    ]
    # NOTE(review): the rendered text is one character longer than the four
    # annotated offsets -- kept exactly as in the original test.
    chunks = ent_renderer.render_ents("abcde", spans, None).split("\n\n")
    expected = [("red", "foo"), ("green", "bar"), ("red", "FOO"), ("green", "BAR")]
    for index, (color, label) in enumerate(expected):
        assert color in chunks[index] and label in chunks[index]


@pytest.mark.issue(10672)
def test_displacy_manual_sorted_entities():
    """Test that manual entities supplied out of order are rendered in the
    order in which they occur in the text."""
    later_ent = {"start": 14, "end": 22, "label": "SECOND"}
    earlier_ent = {"start": 4, "end": 10, "label": "FIRST"}
    manual_doc = {
        "text": "But Google is starting from behind.",
        # Deliberately listed with the later entity first.
        "ents": [later_ent, earlier_ent],
        "title": None,
    }

    markup = displacy.render(manual_doc, style="ent", manual=True)
    first_position = markup.find("FIRST")
    second_position = markup.find("SECOND")
    assert first_position < second_position


@pytest.mark.issue(12816)
def test_issue12816(en_vocab) -> None:
    """Test that displaCy's span visualizer escapes an HTML tag token both
    when the tag is unannotated and when it is covered by a span."""
    escaped_tag = "&lt;TEST&gt;"
    doc = Doc(en_vocab, words=["test", "<TEST>"])

    # Step 1: only the plain word is annotated; the tag token is not.
    doc.spans["sc"] = [Span(doc, 0, 1, label="test")]
    unannotated_markup = displacy.render(doc, style="span")
    assert escaped_tag in unannotated_markup

    # Step 2: add a span over the tag token and render again.
    tag_span = Span(doc, 1, 2, label="test")
    doc.spans["sc"].append(tag_span)
    annotated_markup = displacy.render(doc, style="span")
    assert escaped_tag in annotated_markup


@pytest.mark.issue(13056)
def test_displacy_span_stacking():
    """Test whether span stacking works properly for multiple overlapping spans."""
    tokens = ["Welcome", "to", "the", "Bank", "of", "China", "."]
    spans = [
        {"start_token": 2, "end_token": 5, "label": "SkillNC"},
        {"start_token": 0, "end_token": 2, "label": "Skill"},
        {"start_token": 1, "end_token": 3, "label": "Skill"},
    ]
    info = SpanRenderer._assemble_per_token_info(spans=spans, tokens=tokens)

    assert len(info) == len(tokens)
    # Tokens 0, 3 and 4 are covered by one span; tokens 1 and 2 by two.
    assert all(len(info[index]["entities"]) == 1 for index in (0, 3, 4))
    assert all(len(info[index]["entities"]) == 2 for index in (1, 2))

    # Expected render slots of the entities on the doubly-covered tokens.
    expected_slots = {1: (1, 2), 2: (2, 3)}
    for token_index, slots in expected_slots.items():
        entities = info[token_index]["entities"]
        for position, slot in enumerate(slots):
            assert entities[position]["render_slot"] == slot
-												Migrate regression tests into the main test suite (#9655)

* Migrate regressions 1-1000

* Move serialize test to correct file

* Remove tests that won't work in v3

* Migrate regressions 1000-1500

Removed regression test 1250 because v3 doesn't support the old LEX
scheme anymore.

* Add missing imports in serializer tests

* Migrate tests 1500-2000

* Migrate regressions from 2000-2500

* Migrate regressions from 2501-3000

* Migrate regressions from 3000-3501

* Migrate regressions from 3501-4000

* Migrate regressions from 4001-4500

* Migrate regressions from 4501-5000

* Migrate regressions from 5001-5501

* Migrate regressions from 5501 to 7000

* Migrate regressions from 7001 to 8000

* Migrate remaining regression tests

* Fixing missing imports

* Update docs with new system [ci skip]

* Update CONTRIBUTING.md

- Fix formatting
- Update wording

* Remove lemmatizer tests in el lang

* Move a few tests into the general tokenizer

* Separate Doc and DocBin tests
											
										
										
											2021-12-04 22:34:48 +03:00
+								import numpy
-												Move displaCy tests to own file

											
										
										
											2019-03-11 17:28:34 +03:00
+								import pytest
-												Displacy serve entity linking support without `manual=True` support. (#9748)

* Add support for kb_id to be displayed via displacy.serve. The current support is only limited to the manual option in displacy.render

* Commit to check pre-commit hooks are run.

* Update spacy/displacy/__init__.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Changes as per suggestions on the PR.

* Update website/docs/api/top-level.md

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update website/docs/api/top-level.md

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* tag option as new from 3.2.1 onwards

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
											
										
										
											2021-11-29 19:13:26 +03:00
-												Move displaCy tests to own file

											
										
										
											2019-03-11 17:28:34 +03:00
+								from spacy import displacy
-												Fix displacy span stacking (#13068)

* Fix displacy span stacking.

* Format. Remove counter.

* Remove test files.

* Add unit test. Refactor to allow for unit test.

* Fix off-by-one error in tests.
											
										
										
											2023-11-02 14:02:18 +03:00
+								from spacy.displacy.render import DependencyRenderer, EntityRenderer, SpanRenderer
-												Migrate regression tests into the main test suite (#9655)

* Migrate regressions 1-1000

* Move serialize test to correct file

* Remove tests that won't work in v3

* Migrate regressions 1000-1500

Removed regression test 1250 because v3 doesn't support the old LEX
scheme anymore.

* Add missing imports in serializer tests

* Migrate tests 1500-2000

* Migrate regressions from 2000-2500

* Migrate regressions from 2501-3000

* Migrate regressions from 3000-3501

* Migrate regressions from 3501-4000

* Migrate regressions from 4001-4500

* Migrate regressions from 4501-5000

* Migrate regressions from 5001-5501

* Migrate regressions from 5501 to 7000

* Migrate regressions from 7001 to 8000

* Migrate remaining regression tests

* Fixing missing imports

* Update docs with new system [ci skip]

* Update CONTRIBUTING.md

- Fix formatting
- Update wording

* Remove lemmatizer tests in el lang

* Move a few tests into the general tokenizer

* Separate Doc and DocBin tests
											
										
										
											2021-12-04 22:34:48 +03:00
+								from spacy.lang.en import English
-												💫 Fix displaCy support for RTL languages (#3393)

Closes #2091.

## Description

With the new `vocab.writing_system` property introduced in #3390 (exposed via the language defaults), I was able to finally fix this (I think!). Based on the `Doc`, displaCy now detects whether it's an RTL or LTR language and adjusts the visualization accordingly. Wherever possible, I've also added `direction` and `lang` attributes.

Entity visualization now looks like this:

<img width="318" alt="Screenshot 2019-03-11 at 16 06 51" src="https://user-images.githubusercontent.com/13643239/54136866-d97afd80-441c-11e9-8c27-3d46994cc833.png">

And dependencies like this (ignore the most likely incorrect tags and dependencies):

<img width="621" alt="Screenshot 2019-03-11 at 16 51 59" src="https://user-images.githubusercontent.com/13643239/54137771-8b66f980-441e-11e9-8460-0682b95eef2a.png">

### Types of change
enhancement, bug fix

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-11 20:52:50 +03:00
+								from spacy.lang.fa import Persian
-												Configure isort to use the Black profile, recursively isort the `spacy` module (#12721)

* Use isort with Black profile

* isort all the things

* Fix import cycles as a result of import sorting

* Add DOCBIN_ALL_ATTRS type definition

* Add isort to requirements

* Remove isort from build dependencies check

* Typo
											
										
										
											2023-06-14 18:48:41 +03:00
+								from spacy.tokens import Doc, Span
-												Move displaCy tests to own file

											
										
										
											2019-03-11 17:28:34 +03:00
-												Migrate regression tests into the main test suite (#9655)

* Migrate regressions 1-1000

* Move serialize test to correct file

* Remove tests that won't work in v3

* Migrate regressions 1000-1500

Removed regression test 1250 because v3 doesn't support the old LEX
scheme anymore.

* Add missing imports in serializer tests

* Migrate tests 1500-2000

* Migrate regressions from 2000-2500

* Migrate regressions from 2501-3000

* Migrate regressions from 3000-3501

* Migrate regressions from 3501-4000

* Migrate regressions from 4001-4500

* Migrate regressions from 4501-5000

* Migrate regressions from 5001-5501

* Migrate regressions from 5501 to 7000

* Migrate regressions from 7001 to 8000

* Migrate remaining regression tests

* Fixing missing imports

* Update docs with new system [ci skip]

* Update CONTRIBUTING.md

- Fix formatting
- Update wording

* Remove lemmatizer tests in el lang

* Move a few tests into the general tokenizer

* Separate Doc and DocBin tests
											
										
										
											2021-12-04 22:34:48 +03:00
+								@pytest.mark.issue(2361)
 								def test_issue2361(de_vocab):
 								    """Test if < is escaped when rendering"""
 								    chars = ("&lt;", "&gt;", "&amp;", "&quot;")
 								    words = ["<", ">", "&", '"']
 								    doc = Doc(de_vocab, words=words, deps=["dep"] * len(words))
 								    html = displacy.render(doc)
 								    for char in chars:
 								        assert char in html
 								@pytest.mark.issue(2728)
 								def test_issue2728(en_vocab):
 								    """Test that displaCy ENT visualizer escapes HTML correctly."""
 								    doc = Doc(en_vocab, words=["test", "<RELEASE>", "test"])
 								    doc.ents = [Span(doc, 0, 1, label="TEST")]
 								    html = displacy.render(doc, style="ent")
 								    assert "&lt;RELEASE&gt;" in html
 								    doc.ents = [Span(doc, 1, 2, label="TEST")]
 								    html = displacy.render(doc, style="ent")
 								    assert "&lt;RELEASE&gt;" in html
 								@pytest.mark.issue(3288)
 								def test_issue3288(en_vocab):
 								    """Test that retokenization works correctly via displaCy when punctuation
 								    is merged onto the preceeding token and tensor is resized."""
 								    words = ["Hello", "World", "!", "When", "is", "this", "breaking", "?"]
 								    heads = [1, 1, 1, 4, 4, 6, 4, 4]
 								    deps = ["intj", "ROOT", "punct", "advmod", "ROOT", "det", "nsubj", "punct"]
 								    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
 								    doc.tensor = numpy.zeros((len(words), 96), dtype="float32")
 								    displacy.render(doc)
 								@pytest.mark.issue(3531)
 								def test_issue3531():
 								    """Test that displaCy renderer doesn't require "settings" key."""
 								    example_dep = {
 								        "words": [
 								            {"text": "But", "tag": "CCONJ"},
 								            {"text": "Google", "tag": "PROPN"},
 								            {"text": "is", "tag": "VERB"},
 								            {"text": "starting", "tag": "VERB"},
 								            {"text": "from", "tag": "ADP"},
 								            {"text": "behind.", "tag": "ADV"},
 								        ],
 								        "arcs": [
 								            {"start": 0, "end": 3, "label": "cc", "dir": "left"},
 								            {"start": 1, "end": 3, "label": "nsubj", "dir": "left"},
 								            {"start": 2, "end": 3, "label": "aux", "dir": "left"},
 								            {"start": 3, "end": 4, "label": "prep", "dir": "right"},
 								            {"start": 4, "end": 5, "label": "pcomp", "dir": "right"},
 								        ],
 								    }
 								    example_ent = {
 								        "text": "But Google is starting from behind.",
 								        "ents": [{"start": 4, "end": 10, "label": "ORG"}],
 								    }
 								    dep_html = displacy.render(example_dep, style="dep", manual=True)
 								    assert dep_html
 								    ent_html = displacy.render(example_ent, style="ent", manual=True)
 								    assert ent_html
 								@pytest.mark.issue(3882)
 								def test_issue3882(en_vocab):
 								    """Test that displaCy doesn't serialize the doc.user_data when making a
 								    copy of the Doc.
 								    """
 								    doc = Doc(en_vocab, words=["Hello", "world"], deps=["dep", "dep"])
 								    doc.user_data["test"] = set()
 								    displacy.parse_deps(doc)
-												displaCy: Avoid increasing levels for identical arcs (#10639)

* Test for arc levels for identical arcs

Also moves the test in order with the other numbered tests.

* displaCy: filter identical arcs

Avoid increased levels due to identical arcs by first
filtering any identical arcs.

* Sort keys before filtering

Manual entry with keys out of order would previously become
different tuples and therefore not filtered correctly.

Co-authored-by: Joachim Fainberg <joachimfainberg@Joachims-MBP.lan>
											
										
										
											2022-04-14 17:48:00 +03:00
+								@pytest.mark.issue(5447)
 								def test_issue5447():
 								    """Test that overlapping arcs get separate levels, unless they're identical."""
 								    renderer = DependencyRenderer()
 								    words = [
 								        {"text": "This", "tag": "DT"},
 								        {"text": "is", "tag": "VBZ"},
 								        {"text": "a", "tag": "DT"},
 								        {"text": "sentence.", "tag": "NN"},
 								    ]
 								    arcs = [
 								        {"start": 0, "end": 1, "label": "nsubj", "dir": "left"},
 								        {"start": 2, "end": 3, "label": "det", "dir": "left"},
 								        {"start": 2, "end": 3, "label": "overlap", "dir": "left"},
 								        {"end": 3, "label": "overlap", "start": 2, "dir": "left"},
 								        {"start": 1, "end": 3, "label": "attr", "dir": "left"},
 								    ]
 								    renderer.render([{"words": words, "arcs": arcs}])
 								    assert renderer.highest_level == 3
-												Migrate regression tests into the main test suite (#9655)

* Migrate regressions 1-1000

* Move serialize test to correct file

* Remove tests that won't work in v3

* Migrate regressions 1000-1500

Removed regression test 1250 because v3 doesn't support the old LEX
scheme anymore.

* Add missing imports in serializer tests

* Migrate tests 1500-2000

* Migrate regressions from 2000-2500

* Migrate regressions from 2501-3000

* Migrate regressions from 3000-3501

* Migrate regressions from 3501-4000

* Migrate regressions from 4001-4500

* Migrate regressions from 4501-5000

* Migrate regressions from 5001-5501

* Migrate regressions from 5501 to 7000

* Migrate regressions from 7001 to 8000

* Migrate remaining regression tests

* Fixing missing imports

* Update docs with new system [ci skip]

* Update CONTRIBUTING.md

- Fix formatting
- Update wording

* Remove lemmatizer tests in el lang

* Move a few tests into the general tokenizer

* Separate Doc and DocBin tests
											
										
										
											2021-12-04 22:34:48 +03:00
+								@pytest.mark.issue(5838)
 								def test_issue5838():
 								    # Displacy's EntityRenderer break line
 								    # not working after last entity
 								    sample_text = "First line\nSecond line, with ent\nThird line\nFourth line\n"
 								    nlp = English()
 								    doc = nlp(sample_text)
 								    doc.ents = [Span(doc, 7, 8, label="test")]
 								    html = displacy.render(doc, style="ent")
-												Update br tags (#12882)

* Fix displacy br tag

* Prefer <br>, also update package CLI
											
										
										
											2023-08-04 11:52:41 +03:00
+								    found = html.count("<br>")
-												Migrate regression tests into the main test suite (#9655)

* Migrate regressions 1-1000

* Move serialize test to correct file

* Remove tests that won't work in v3

* Migrate regressions 1000-1500

Removed regression test 1250 because v3 doesn't support the old LEX
scheme anymore.

* Add missing imports in serializer tests

* Migrate tests 1500-2000

* Migrate regressions from 2000-2500

* Migrate regressions from 2501-3000

* Migrate regressions from 3000-3501

* Migrate regressions from 3501-4000

* Migrate regressions from 4001-4500

* Migrate regressions from 4501-5000

* Migrate regressions from 5001-5501

* Migrate regressions from 5501 to 7000

* Migrate regressions from 7001 to 8000

* Migrate remaining regression tests

* Fixing missing imports

* Update docs with new system [ci skip]

* Update CONTRIBUTING.md

- Fix formatting
- Update wording

* Remove lemmatizer tests in el lang

* Move a few tests into the general tokenizer

* Separate Doc and DocBin tests
											
										
										
											2021-12-04 22:34:48 +03:00
+								    assert found == 4
-												Add displacy support for overlapping Spans (#10332)

* Fix docstring for EntityRenderer

* Add warning in displacy if doc.spans are empty

* Implement parse_spans converter

One notable change here is that the default spans_key is sc, and
it's set by the user through the options.

* Implement SpanRenderer

Here, I implemented a SpanRenderer that looks similar to the
EntityRenderer except for some templates.  The spans_key, by default, is
set to sc, but can be configured in the options (see parse_spans). The
way I rendered these spans is per-token, i.e., I first check if each
token (1) belongs to a given span type and (2) a starting token of a
given span type. Once I have this information, I render them into the
markup.

* Fix mypy issues on typing

* Add tests for displacy spans support

* Update colors from RGB to hex

Co-authored-by: Ines Montani <ines@ines.io>

* Remove unnecessary CSS properties

* Add documentation for website

* Remove unnecessary scripts

* Update wording on the documentation

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Put typing dependency on top of file

* Put back z-index so that spans overlap properly

* Make warning more explicit for spans_key

Co-authored-by: Ines Montani <ines@ines.io>
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
											
										
										
											2022-03-16 20:14:34 +03:00
+								def test_displacy_parse_spans(en_vocab):
 								    """Test that spans on a Doc are converted into displaCy's format."""
 								    doc = Doc(en_vocab, words=["Welcome", "to", "the", "Bank", "of", "China"])
 								    doc.spans["sc"] = [Span(doc, 3, 6, "ORG"), Span(doc, 5, 6, "GPE")]
 								    spans = displacy.parse_spans(doc)
 								    assert isinstance(spans, dict)
 								    assert spans["text"] == "Welcome to the Bank of China "
 								    assert spans["spans"] == [
 								        {
 								            "start": 15,
 								            "end": 28,
 								            "start_token": 3,
 								            "end_token": 6,
 								            "label": "ORG",
 								            "kb_id": "",
 								            "kb_url": "#",
 								        },
 								        {
 								            "start": 23,
 								            "end": 28,
 								            "start_token": 5,
 								            "end_token": 6,
 								            "label": "GPE",
 								            "kb_id": "",
 								            "kb_url": "#",
 								        },
 								    ]
 								def test_displacy_parse_spans_with_kb_id_options(en_vocab):
 								    """Check that parse_spans fills kb_id and formats kb_url from kb_url_template."""
 								    words = ["Welcome", "to", "the", "Bank", "of", "China"]
 								    doc = Doc(en_vocab, words=words)
 								    org_span = Span(doc, 3, 6, "ORG", kb_id="Q790068")
 								    gpe_span = Span(doc, 5, 6, "GPE", kb_id="Q148")
 								    doc.spans["sc"] = [org_span, gpe_span]
 								    options = {"kb_url_template": "https://wikidata.org/wiki/{}"}
 								    expected_org = {
 								        "start": 15,
 								        "end": 28,
 								        "start_token": 3,
 								        "end_token": 6,
 								        "label": "ORG",
 								        "kb_id": "Q790068",
 								        "kb_url": "https://wikidata.org/wiki/Q790068",
 								    }
 								    expected_gpe = {
 								        "start": 23,
 								        "end": 28,
 								        "start_token": 5,
 								        "end_token": 6,
 								        "label": "GPE",
 								        "kb_id": "Q148",
 								        "kb_url": "https://wikidata.org/wiki/Q148",
 								    }
 								    parsed = displacy.parse_spans(doc, options)
 								    assert isinstance(parsed, dict)
 								    assert parsed["text"] == "Welcome to the Bank of China "
 								    assert parsed["spans"] == [expected_org, expected_gpe]
 								def test_displacy_parse_spans_different_spans_key(en_vocab):
 								    """Check that the spans_key option selects which span group is parsed."""
 								    words = ["Welcome", "to", "the", "Bank", "of", "China"]
 								    doc = Doc(en_vocab, words=words)
 								    # Populate "sc" as well, so the result shows that only "custom" is read.
 								    doc.spans["sc"] = [Span(doc, 3, 6, "ORG"), Span(doc, 5, 6, "GPE")]
 								    doc.spans["custom"] = [Span(doc, 3, 6, "BANK")]
 								    expected_bank = {
 								        "start": 15,
 								        "end": 28,
 								        "start_token": 3,
 								        "end_token": 6,
 								        "label": "BANK",
 								        "kb_id": "",
 								        "kb_url": "#",
 								    }
 								    parsed = displacy.parse_spans(doc, options={"spans_key": "custom"})
 								    assert isinstance(parsed, dict)
 								    assert parsed["text"] == "Welcome to the Bank of China "
 								    assert parsed["spans"] == [expected_bank]
-												Don't throw an error if using displacy on an unset span key (#11845)

* Don't throw an error if using displacy on an unset span key

* List available keys in W117
											
										
										
											2022-11-28 12:01:09 +03:00
+								def test_displacy_parse_empty_spans_key(en_vocab):
 								    """Test that having an unset spans key doesn't raise an error"""
 								    doc = Doc(en_vocab, words=["Welcome", "to", "the", "Bank", "of", "China"])
 								    doc.spans["custom"] = [Span(doc, 3, 6, "BANK")]
 								    with pytest.warns(UserWarning, match="W117"):
 								        spans = displacy.parse_spans(doc)
 								    assert isinstance(spans, dict)
-												Move displaCy tests to own file

											
										
										
											2019-03-11 17:28:34 +03:00
+								def test_displacy_parse_ents(en_vocab):
 								    """Test that named entities on a Doc are converted into displaCy's format."""
-												Tidy up tests and docs

											
										
										
											2020-09-21 21:43:54 +03:00
+								    doc = Doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"])
-												Move displaCy tests to own file

											
										
										
											2019-03-11 17:28:34 +03:00
+								    doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"])]
 								    ents = displacy.parse_ents(doc)
 								    assert isinstance(ents, dict)
 								    assert ents["text"] == "But Google is starting from behind "
-												Displacy serve entity linking support without `manual=True` support. (#9748)

* Add support for kb_id to be displayed via displacy.serve. The current support is only limited to the manual option in displacy.render

* Commit to check pre-commit hooks are run.

* Update spacy/displacy/__init__.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Changes as per suggestions on the PR.

* Update website/docs/api/top-level.md

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update website/docs/api/top-level.md

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* tag option as new from 3.2.1 onwards

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
											
										
										
											2021-11-29 19:13:26 +03:00
+								    assert ents["ents"] == [
 								        {"start": 4, "end": 10, "label": "ORG", "kb_id": "", "kb_url": "#"}
 								    ]
 								    doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"], kb_id="Q95")]
 								    ents = displacy.parse_ents(doc)
 								    assert isinstance(ents, dict)
 								    assert ents["text"] == "But Google is starting from behind "
 								    assert ents["ents"] == [
 								        {"start": 4, "end": 10, "label": "ORG", "kb_id": "Q95", "kb_url": "#"}
 								    ]
 								def test_displacy_parse_ents_with_kb_id_options(en_vocab):
 								    """Check that parse_ents formats kb_url from kb_url_template for a linked entity."""
 								    words = ["But", "Google", "is", "starting", "from", "behind"]
 								    doc = Doc(en_vocab, words=words)
 								    org_label = doc.vocab.strings["ORG"]
 								    doc.ents = [Span(doc, 1, 2, label=org_label, kb_id="Q95")]
 								    options = {"kb_url_template": "https://www.wikidata.org/wiki/{}"}
 								    expected_ent = {
 								        "start": 4,
 								        "end": 10,
 								        "label": "ORG",
 								        "kb_id": "Q95",
 								        "kb_url": "https://www.wikidata.org/wiki/Q95",
 								    }
 								    parsed = displacy.parse_ents(doc, options)
 								    assert isinstance(parsed, dict)
 								    assert parsed["text"] == "But Google is starting from behind "
 								    assert parsed["ents"] == [expected_ent]
-												Move displaCy tests to own file

											
										
										
											2019-03-11 17:28:34 +03:00
 								def test_displacy_parse_deps(en_vocab):
 								    """Test that deps and tags on a Doc are converted into displaCy's format."""
 								    words = ["This", "is", "a", "sentence"]
-												Tidy up tests and docs

											
										
										
											2020-09-21 21:43:54 +03:00
+								    heads = [1, 1, 3, 1]
-												Move displaCy tests to own file

											
										
										
											2019-03-11 17:28:34 +03:00
+								    pos = ["DET", "VERB", "DET", "NOUN"]
 								    tags = ["DT", "VBZ", "DT", "NN"]
 								    deps = ["nsubj", "ROOT", "det", "attr"]
-												Tidy up tests and docs

											
										
										
											2020-09-21 21:43:54 +03:00
+								    doc = Doc(en_vocab, words=words, heads=heads, pos=pos, tags=tags, deps=deps)
-												Move displaCy tests to own file

											
										
										
											2019-03-11 17:28:34 +03:00
+								    deps = displacy.parse_deps(doc)
 								    assert isinstance(deps, dict)
 								    assert deps["words"] == [
-												Bugfix/get doc (#5049)

* new (broken) unit test

* fixing get_doc method

											
										
										
											2020-03-02 13:49:28 +03:00
+								        {"lemma": None, "text": words[0], "tag": pos[0]},
 								        {"lemma": None, "text": words[1], "tag": pos[1]},
 								        {"lemma": None, "text": words[2], "tag": pos[2]},
 								        {"lemma": None, "text": words[3], "tag": pos[3]},
-												Move displaCy tests to own file

											
										
										
											2019-03-11 17:28:34 +03:00
+								    ]
 								    assert deps["arcs"] == [
 								        {"start": 0, "end": 1, "label": "nsubj", "dir": "left"},
 								        {"start": 2, "end": 3, "label": "det", "dir": "left"},
 								        {"start": 1, "end": 3, "label": "attr", "dir": "right"},
 								    ]
-												Allow passing a Span to displacy.parse_deps (#12477)

* Allow passing a Span to displacy.parse_deps

* Update docstring

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Update API docs

---------

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
											
										
										
											2023-03-31 10:44:01 +03:00
+								    # Test that displacy.parse_deps converts Span to Doc
 								    deps = displacy.parse_deps(doc[:])
 								    assert isinstance(deps, dict)
 								    assert deps["words"] == [
 								        {"lemma": None, "text": words[0], "tag": pos[0]},
 								        {"lemma": None, "text": words[1], "tag": pos[1]},
 								        {"lemma": None, "text": words[2], "tag": pos[2]},
 								        {"lemma": None, "text": words[3], "tag": pos[3]},
 								    ]
 								    assert deps["arcs"] == [
 								        {"start": 0, "end": 1, "label": "nsubj", "dir": "left"},
 								        {"start": 2, "end": 3, "label": "det", "dir": "left"},
 								        {"start": 1, "end": 3, "label": "attr", "dir": "right"},
 								    ]
-												Move displaCy tests to own file

											
										
										
											2019-03-11 17:28:34 +03:00
-												Raise error for negative arc indices (closes #3917)

											
										
										
											2019-08-20 16:51:37 +03:00
+								def test_displacy_invalid_arcs():
 								    renderer = DependencyRenderer()
 								    words = [{"text": "This", "tag": "DET"}, {"text": "is", "tag": "VERB"}]
 								    arcs = [
 								        {"start": 0, "end": 1, "label": "nsubj", "dir": "left"},
 								        {"start": -1, "end": 2, "label": "det", "dir": "left"},
 								    ]
 								    with pytest.raises(ValueError):
 								        renderer.render([{"words": words, "arcs": arcs}])
-												Move displaCy tests to own file

											
										
										
											2019-03-11 17:28:34 +03:00
+								def test_displacy_spans(en_vocab):
 								    """Test that displaCy can render Spans."""
-												Tidy up tests and docs

											
										
										
											2020-09-21 21:43:54 +03:00
+								    doc = Doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"])
-												Move displaCy tests to own file

											
										
										
											2019-03-11 17:28:34 +03:00
+								    doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"])]
 								    html = displacy.render(doc[1:4], style="ent")
 								    assert html.startswith("<div")
 								def test_displacy_raises_for_wrong_type(en_vocab):
 								    """Check that displacy.render raises ValueError when given a plain string."""
 								    not_a_doc = "hello world"
 								    with pytest.raises(ValueError):
 								        displacy.render(not_a_doc)
-												💫 Fix displaCy support for RTL languages (#3393)

Closes #2091.

## Description

With the new `vocab.writing_system` property introduced in #3390 (exposed via the language defaults), I was able to finally fix this (I think!). Based on the `Doc`, displaCy now detects whether it's a RTL or LTR language and adjusts the visualization accordingly. Wherever possible, I've also added `direction` and `lang` attributes.

Entity visualization now looks like this:

<img width="318" alt="Screenshot 2019-03-11 at 16 06 51" src="https://user-images.githubusercontent.com/13643239/54136866-d97afd80-441c-11e9-8c27-3d46994cc833.png">

And dependencies like this (ignore the most likely incorrect tags and dependencies):

<img width="621" alt="Screenshot 2019-03-11 at 16 51 59" src="https://user-images.githubusercontent.com/13643239/54137771-8b66f980-441e-11e9-8460-0682b95eef2a.png">

### Types of change
enhancement, bug fix

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-11 20:52:50 +03:00
 								def test_displacy_rtl():
 								    # Source: http://www.sobhe.ir/hazm/ – is this correct?
 								    words = ["ما", "بسیار", "کتاب", "می\u200cخوانیم"]
 								    # These are (likely) wrong, but it's just for testing
 								    pos = ["PRO", "ADV", "N_PL", "V_SUB"]  # needs to match lang.fa.tag_map
 								    deps = ["foo", "bar", "foo", "baz"]
-												Tidy up tests and docs

											
										
										
											2020-09-21 21:43:54 +03:00
+								    heads = [1, 0, 3, 1]
-												💫 Fix displaCy support for RTL languages (#3393)

Closes #2091.

## Description

With the new `vocab.writing_system` property introduced in #3390 (exposed via the language defaults), I was able to finally fix this (I think!). Based on the `Doc`, displaCy now detects whether it's a RTL or LTR language and adjusts the visualization accordingly. Wherever possible, I've also added `direction` and `lang` attributes.

Entity visualization now looks like this:

<img width="318" alt="Screenshot 2019-03-11 at 16 06 51" src="https://user-images.githubusercontent.com/13643239/54136866-d97afd80-441c-11e9-8c27-3d46994cc833.png">

And dependencies like this (ignore the most likely incorrect tags and dependencies):

<img width="621" alt="Screenshot 2019-03-11 at 16 51 59" src="https://user-images.githubusercontent.com/13643239/54137771-8b66f980-441e-11e9-8460-0682b95eef2a.png">

### Types of change
enhancement, bug fix

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-11 20:52:50 +03:00
+								    nlp = Persian()
-												Tidy up tests and docs

											
										
										
											2020-09-21 21:43:54 +03:00
+								    doc = Doc(nlp.vocab, words=words, tags=pos, heads=heads, deps=deps)
-												💫 Fix displaCy support for RTL languages (#3393)

Closes #2091.

## Description

With the new `vocab.writing_system` property introduced in #3390 (exposed via the language defaults), I was able to finally fix this (I think!). Based on the `Doc`, displaCy now detects whether it's a RTL or LTR language and adjusts the visualization accordingly. Wherever possible, I've also added `direction` and `lang` attributes.

Entity visualization now looks like this:

<img width="318" alt="Screenshot 2019-03-11 at 16 06 51" src="https://user-images.githubusercontent.com/13643239/54136866-d97afd80-441c-11e9-8c27-3d46994cc833.png">

And dependencies like this (ignore the most likely incorrect tags and dependencies):

<img width="621" alt="Screenshot 2019-03-11 at 16 51 59" src="https://user-images.githubusercontent.com/13643239/54137771-8b66f980-441e-11e9-8460-0682b95eef2a.png">

### Types of change
enhancement, bug fix

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-11 20:52:50 +03:00
+								    doc.ents = [Span(doc, 1, 3, label="TEST")]
 								    html = displacy.render(doc, page=True, style="dep")
 								    assert "direction: rtl" in html
 								    assert 'direction="rtl"' in html
-												Drop Python 2.7 and 3.5 (#4828)

* Remove unicode declarations

* Remove Python 3.5 and 2.7 from CI

* Don't require pathlib

* Replace compat helpers

* Remove OrderedDict

* Use f-strings

* Set Cython compiler language level

* Fix typo

* Re-add OrderedDict for Table

* Update setup.cfg

* Revert CONTRIBUTING.md

* Revert lookups.md

* Revert top-level.md

* Small adjustments and docs [ci skip]

											
										
										
											2019-12-22 03:53:56 +03:00
+								    assert f'lang="{nlp.lang}"' in html
-												💫 Fix displaCy support for RTL languages (#3393)

Closes #2091.

## Description

With the new `vocab.writing_system` property introduced in #3390 (exposed via the language defaults), I was able to finally fix this (I think!). Based on the `Doc`, displaCy now detects whether it's a RTL or LTR language and adjusts the visualization accordingly. Wherever possible, I've also added `direction` and `lang` attributes.

Entity visualization now looks like this:

<img width="318" alt="Screenshot 2019-03-11 at 16 06 51" src="https://user-images.githubusercontent.com/13643239/54136866-d97afd80-441c-11e9-8c27-3d46994cc833.png">

And dependencies like this (ignore the most likely incorrect tags and dependencies):

<img width="621" alt="Screenshot 2019-03-11 at 16 51 59" src="https://user-images.githubusercontent.com/13643239/54137771-8b66f980-441e-11e9-8460-0682b95eef2a.png">

### Types of change
enhancement, bug fix

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-11 20:52:50 +03:00
+								    html = displacy.render(doc, page=True, style="ent")
 								    assert "direction: rtl" in html
-												Drop Python 2.7 and 3.5 (#4828)

* Remove unicode declarations

* Remove Python 3.5 and 2.7 from CI

* Don't require pathlib

* Replace compat helpers

* Remove OrderedDict

* Use f-strings

* Set Cython compiler language level

* Fix typo

* Re-add OrderedDict for Table

* Update setup.cfg

* Revert CONTRIBUTING.md

* Revert lookups.md

* Revert top-level.md

* Small adjustments and docs [ci skip]

											
										
										
											2019-12-22 03:53:56 +03:00
+								    assert f'lang="{nlp.lang}"' in html
-												Update test_displacy.py

											
										
										
											2019-03-11 21:03:52 +03:00
-												Auto-format [ci skip]

											
										
										
											2019-03-12 15:35:34 +03:00
-												Update test_displacy.py

											
										
										
											2019-03-11 21:03:52 +03:00
+								def test_displacy_render_wrapper(en_vocab):
 								    """Test that displaCy accepts custom rendering wrapper."""
 								    def wrapper(html):
 								        return "TEST" + html + "TEST"
 								    displacy.set_render_wrapper(wrapper)
-												Tidy up tests and docs

											
										
										
											2020-09-21 21:43:54 +03:00
+								    doc = Doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"])
-												Update test_displacy.py

											
										
										
											2019-03-11 21:03:52 +03:00
+								    doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"])]
 								    html = displacy.render(doc, style="ent")
 								    assert html.startswith("TEST<div")
 								    assert html.endswith("/div>TEST")
-												Auto-format [ci skip]

											
										
										
											2019-03-12 15:35:34 +03:00
+								    # Restore
 								    displacy.set_render_wrapper(lambda html: html)
-												Adjust label casing in displaCy NER visualizer (resolves #4866)

- Accept any case for label names in ents and colors option, even if actual predicted label uses different casing
- Don't text-transform: uppercase visually, if it's important to users that the label is represented as-is in the UI

											
										
										
											2020-08-21 12:51:31 +03:00
-												Add displaCy data structures to docs (2) (#12875)

* Add data structures to docs

* Adjusted descriptions for more consistency

* Add _optional_ flag to parameters

* Add tests and adjust optional title key in doc

* Add title to dep visualizations

* fix typo

---------

Co-authored-by: thomashacker <EdwardSchmuhl@web.de>
											
										
										
											2023-07-31 11:47:57 +03:00
+								def test_displacy_render_manual_dep():
 								    """Test displacy.render with manual data for dep style"""
 								    parsed_dep = {
 								        "words": [
 								            {"text": "This", "tag": "DT"},
 								            {"text": "is", "tag": "VBZ"},
 								            {"text": "a", "tag": "DT"},
 								            {"text": "sentence", "tag": "NN"},
 								        ],
 								        "arcs": [
 								            {"start": 0, "end": 1, "label": "nsubj", "dir": "left"},
 								            {"start": 2, "end": 3, "label": "det", "dir": "left"},
 								            {"start": 1, "end": 3, "label": "attr", "dir": "right"},
 								        ],
 								        "title": "Title",
 								    }
 								    html = displacy.render([parsed_dep], style="dep", manual=True)
 								    for word in parsed_dep["words"]:
 								        assert word["text"] in html
 								        assert word["tag"] in html
 								def test_displacy_render_manual_ent():
 								    """Render hand-written ent data and check labels and titles are in the markup.
 								    The second entry uses offsets (-100, 100) that lie outside its text.
 								    """
 								    text = "But Google is starting from behind."
 								    untitled = {
 								        "text": text,
 								        "ents": [{"start": 4, "end": 10, "label": "ORG"}],
 								    }
 								    titled = {
 								        "text": text,
 								        "ents": [{"start": -100, "end": 100, "label": "COMPANY"}],
 								        "title": "Title",
 								    }
 								    parsed_ents = [untitled, titled]
 								    html = displacy.render(parsed_ents, style="ent", manual=True)
 								    for entry in parsed_ents:
 								        first_label = entry["ents"][0]["label"]
 								        assert first_label in html
 								        if "title" not in entry:
 								            continue
 								        assert entry["title"] in html
 								def test_displacy_render_manual_span():
 								    """Render hand-written span data and check labels and titles are in the markup."""
 								    def build_entry():
 								        # Fresh containers on every call, so the two entries share no
 								        # mutable state.
 								        return {
 								            "text": "Welcome to the Bank of China.",
 								            "spans": [
 								                {"start_token": 3, "end_token": 6, "label": "ORG"},
 								                {"start_token": 5, "end_token": 6, "label": "GPE"},
 								            ],
 								            "tokens": ["Welcome", "to", "the", "Bank", "of", "China", "."],
 								        }
 								    untitled = build_entry()
 								    titled = build_entry()
 								    titled["title"] = "Title"
 								    parsed_spans = [untitled, titled]
 								    html = displacy.render(parsed_spans, style="span", manual=True)
 								    for entry in parsed_spans:
 								        first_label = entry["spans"][0]["label"]
 								        assert first_label in html
 								        if "title" not in entry:
 								            continue
 								        assert entry["title"] in html
-												Adjust label casing in displaCy NER visualizer (resolves #4866)

- Accept any case for label names in ents and colors option, even if actual predicted label uses different casing
- Don't text-transform: uppercase visually, if it's important to users that the label is represented as-is in the UI

											
										
										
											2020-08-21 12:51:31 +03:00
+								def test_displacy_options_case():
 								    ents = ["foo", "BAR"]
 								    colors = {"FOO": "red", "bar": "green"}
 								    renderer = EntityRenderer({"ents": ents, "colors": colors})
 								    text = "abcd"
 								    labels = ["foo", "bar", "FOO", "BAR"]
 								    spans = [{"start": i, "end": i + 1, "label": labels[i]} for i in range(len(text))]
 								    result = renderer.render_ents("abcde", spans, None).split("\n\n")
 								    assert "red" in result[0] and "foo" in result[0]
 								    assert "green" in result[1] and "bar" in result[1]
 								    assert "red" in result[2] and "FOO" in result[2]
 								    assert "green" in result[3] and "BAR" in result[3]
-												#10672: fixes displacy output for manual unsorted entities (#10673)

* #10672: fixes displacy output for manual unsorted entities

* #10672: removed unused import

* fix prettier formatting

Co-authored-by: Harm Buisman <h.buisman@iknl.nl>
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
											
										
										
											2022-04-27 10:51:58 +03:00
 								@pytest.mark.issue(10672)
 								def test_displacy_manual_sorted_entities():
 								    """Test that manually supplied entities are rendered in text order.
 								    The "ents" list is deliberately given out of order (the later span
 								    first); the markup must still show "FIRST" before "SECOND".
 								    """
 								    doc = {
 								        "text": "But Google is starting from behind.",
 								        "ents": [
 								            {"start": 14, "end": 22, "label": "SECOND"},
 								            {"start": 4, "end": 10, "label": "FIRST"},
 								        ],
 								        "title": None,
 								    }
 								    html = displacy.render(doc, style="ent", manual=True)
 								    first_at = html.find("FIRST")
 								    second_at = html.find("SECOND")
 								    # str.find returns -1 on a miss, so a bare `first_at < second_at`
 								    # would pass even if "FIRST" were never rendered; require both
 								    # labels to be present before comparing their positions.
 								    assert first_at != -1
 								    assert second_at != -1
 								    assert first_at < second_at
-												🐛 Escape annotated HTML tags in span renderer (#12817)

These changes add a missing call to `escape_html` in the displaCy span
renderer. Previously span-annotated tokens would be inserted into the
page markup without being escaped, resulting in potentially incorrect
rendering. When I encountered this issue, it resulted in some docs and
span underlines being superimposed on top of properly rendered docs and
span underlines near the beginning of the visualization (due to an
unescaped `<span>` tag).
											
										
										
											2023-07-13 18:33:05 +03:00
 								@pytest.mark.issue(12816)
 								def test_issue12816(en_vocab) -> None:
 								    """Check that the span visualizer escapes an HTML-like token, annotated or not."""
 								    escaped_tag = "&lt;TEST&gt;"
 								    doc = Doc(en_vocab, words=["test", "<TEST>"])
 								    # First pass: only the plain word is annotated, the tag token is not.
 								    doc.spans["sc"] = [Span(doc, 0, 1, label="test")]
 								    assert escaped_tag in displacy.render(doc, style="span")
 								    # Second pass: the tag token itself now belongs to a span.
 								    doc.spans["sc"].append(Span(doc, 1, 2, label="test"))
 								    assert escaped_tag in displacy.render(doc, style="span")
-												Fix displacy span stacking (#13068)

* Fix displacy span stacking.

* Format. Remove counter.

* Remove test files.

* Add unit test. Refactor to allow for unit test.

* Fix off-by-one error in tests.
											
										
										
											2023-11-02 14:02:18 +03:00
 								@pytest.mark.issue(13056)
 								def test_displacy_span_stacking():
 								    """Check entity counts and render slots per token for three overlapping spans."""
 								    tokens = ["Welcome", "to", "the", "Bank", "of", "China", "."]
 								    spans = [
 								        {"start_token": 2, "end_token": 5, "label": "SkillNC"},
 								        {"start_token": 0, "end_token": 2, "label": "Skill"},
 								        {"start_token": 1, "end_token": 3, "label": "Skill"},
 								    ]
 								    info = SpanRenderer._assemble_per_token_info(spans=spans, tokens=tokens)
 								    assert len(info) == len(tokens)
 								    # Tokens 0, 3 and 4 sit in one span each; tokens 1 and 2 in two.
 								    assert all(len(info[idx]["entities"]) == 1 for idx in (0, 3, 4))
 								    assert all(len(info[idx]["entities"]) == 2 for idx in (1, 2))
 								    # Expected render slots, keyed by token index, in entity order.
 								    expected_slots = {1: (1, 2), 2: (2, 3)}
 								    for idx, slots in expected_slots.items():
 								        for pos, slot in enumerate(slots):
 								            assert info[idx]["entities"][pos]["render_slot"] == slot