spaCy/spacy/tests/test_displacy.py

import numpy
import pytest

from spacy import displacy
from spacy.displacy.render import DependencyRenderer, EntityRenderer
from spacy.lang.en import English
from spacy.lang.fa import Persian
from spacy.tokens import Span, Doc


@pytest.mark.issue(2361)
def test_issue2361(de_vocab):
    """Check that HTML special characters are escaped in rendered output."""
    raw_tokens = ["<", ">", "&", '"']
    escaped_forms = ("&lt;", "&gt;", "&amp;", "&quot;")
    dep_labels = ["dep"] * len(raw_tokens)
    markup = displacy.render(Doc(de_vocab, words=raw_tokens, deps=dep_labels))
    # Collect every escape sequence that failed to show up so a failure
    # reports all of them at once.
    missing = [entity for entity in escaped_forms if entity not in markup]
    assert not missing


@pytest.mark.issue(2728)
def test_issue2728(en_vocab):
    """Check that the ENT visualizer escapes markup-like token text.

    The token ``<RELEASE>`` must be escaped both when it lies outside the
    entity span and when it is the entity itself.
    """
    doc = Doc(en_vocab, words=["test", "<RELEASE>", "test"])
    # First an entity next to the markup-like token, then one covering it.
    for start, end in ((0, 1), (1, 2)):
        doc.ents = [Span(doc, start, end, label="TEST")]
        rendered = displacy.render(doc, style="ent")
        assert "&lt;RELEASE&gt;" in rendered


@pytest.mark.issue(3288)
def test_issue3288(en_vocab):
    """Test that retokenization works correctly via displaCy when punctuation
    is merged onto the preceding token and the tensor is resized.

    The test only requires that rendering completes without raising.
    """
    tokens = ["Hello", "World", "!", "When", "is", "this", "breaking", "?"]
    head_indices = [1, 1, 1, 4, 4, 6, 4, 4]
    dep_labels = ["intj", "ROOT", "punct", "advmod", "ROOT", "det", "nsubj", "punct"]
    doc = Doc(en_vocab, words=tokens, heads=head_indices, deps=dep_labels)
    # Attach a tensor with one row per token before rendering.
    tensor_shape = (len(tokens), 96)
    doc.tensor = numpy.zeros(tensor_shape, dtype="float32")
    displacy.render(doc)


@pytest.mark.issue(3531)
def test_issue3531():
    """Check that manual rendering works for input without a "settings" key."""
    tagged_words = [
        ("But", "CCONJ"),
        ("Google", "PROPN"),
        ("is", "VERB"),
        ("starting", "VERB"),
        ("from", "ADP"),
        ("behind.", "ADV"),
    ]
    arc_rows = [
        (0, 3, "cc", "left"),
        (1, 3, "nsubj", "left"),
        (2, 3, "aux", "left"),
        (3, 4, "prep", "right"),
        (4, 5, "pcomp", "right"),
    ]
    # Neither manual input dict below carries a "settings" entry.
    dep_input = {
        "words": [{"text": text, "tag": tag} for text, tag in tagged_words],
        "arcs": [
            {"start": start, "end": end, "label": label, "dir": direction}
            for start, end, label, direction in arc_rows
        ],
    }
    ent_input = {
        "text": "But Google is starting from behind.",
        "ents": [{"start": 4, "end": 10, "label": "ORG"}],
    }
    assert displacy.render(dep_input, style="dep", manual=True)
    assert displacy.render(ent_input, style="ent", manual=True)


@pytest.mark.issue(3882)
def test_issue3882(en_vocab):
    """Check that ``displacy.parse_deps`` does not serialize ``doc.user_data``
    when it makes a copy of the Doc.

    The test only requires that the call completes without raising.
    """
    tokens = ["Hello", "world"]
    doc = Doc(en_vocab, words=tokens, deps=["dep"] * len(tokens))
    # NOTE(review): presumably a set is used because it cannot be serialized,
    # so serializing user_data would raise here - confirm.
    doc.user_data["test"] = set()
    displacy.parse_deps(doc)


@pytest.mark.issue(5447)
def test_issue5447():
    """Check that overlapping arcs are placed on separate levels, except when
    the arcs are identical."""
    tagged_words = [("This", "DT"), ("is", "VBZ"), ("a", "DT"), ("sentence.", "NN")]
    parsed = {
        "words": [{"text": text, "tag": tag} for text, tag in tagged_words],
        "arcs": [
            {"start": 0, "end": 1, "label": "nsubj", "dir": "left"},
            {"start": 2, "end": 3, "label": "det", "dir": "left"},
            {"start": 2, "end": 3, "label": "overlap", "dir": "left"},
            # Same arc as the previous one, with its keys in a different order.
            {"end": 3, "label": "overlap", "start": 2, "dir": "left"},
            {"start": 1, "end": 3, "label": "attr", "dir": "left"},
        ],
    }
    dep_renderer = DependencyRenderer()
    dep_renderer.render([parsed])
    assert dep_renderer.highest_level == 3


@pytest.mark.issue(5838)
def test_issue5838():
    """Check that the entity renderer emits line breaks after the last entity.

    The sample text has four newlines, one of them after the only entity, and
    the rendered HTML must contain four ``</br>`` tags.
    """
    sample_text = (
        "First line\n"
        "Second line, with ent\n"
        "Third line\n"
        "Fourth line\n"
    )
    doc = English()(sample_text)
    doc.ents = [Span(doc, 7, 8, label="test")]
    markup = displacy.render(doc, style="ent")
    assert markup.count("</br>") == 4


def test_displacy_parse_spans(en_vocab):
    """Check that ``doc.spans`` entries are converted into displaCy's format."""
    tokens = ["Welcome", "to", "the", "Bank", "of", "China"]
    doc = Doc(en_vocab, words=tokens)
    org_span = Span(doc, 3, 6, "ORG")
    gpe_span = Span(doc, 5, 6, "GPE")
    doc.spans["sc"] = [org_span, gpe_span]

    parsed = displacy.parse_spans(doc)

    # Without a kb_id the converter reports an empty id and "#" as the URL.
    expected_spans = [
        dict(
            start=15,
            end=28,
            start_token=3,
            end_token=6,
            label="ORG",
            kb_id="",
            kb_url="#",
        ),
        dict(
            start=23,
            end=28,
            start_token=5,
            end_token=6,
            label="GPE",
            kb_id="",
            kb_url="#",
        ),
    ]
    assert isinstance(parsed, dict)
    assert parsed["text"] == "Welcome to the Bank of China "
    assert parsed["spans"] == expected_spans


def test_displacy_parse_spans_with_kb_id_options(en_vocab):
    """Check that spans carrying a kb_id get a URL built from the
    ``kb_url_template`` option."""
    tokens = ["Welcome", "to", "the", "Bank", "of", "China"]
    doc = Doc(en_vocab, words=tokens)
    org_span = Span(doc, 3, 6, "ORG", kb_id="Q790068")
    gpe_span = Span(doc, 5, 6, "GPE", kb_id="Q148")
    doc.spans["sc"] = [org_span, gpe_span]

    render_options = {"kb_url_template": "https://wikidata.org/wiki/{}"}
    parsed = displacy.parse_spans(doc, render_options)

    expected_spans = [
        dict(
            start=15,
            end=28,
            start_token=3,
            end_token=6,
            label="ORG",
            kb_id="Q790068",
            kb_url="https://wikidata.org/wiki/Q790068",
        ),
        dict(
            start=23,
            end=28,
            start_token=5,
            end_token=6,
            label="GPE",
            kb_id="Q148",
            kb_url="https://wikidata.org/wiki/Q148",
        ),
    ]
    assert isinstance(parsed, dict)
    assert parsed["text"] == "Welcome to the Bank of China "
    assert parsed["spans"] == expected_spans


def test_displacy_parse_spans_different_spans_key(en_vocab):
    """Check that the ``spans_key`` option selects which span group is parsed."""
    tokens = ["Welcome", "to", "the", "Bank", "of", "China"]
    doc = Doc(en_vocab, words=tokens)
    # Fill the "sc" group as well, so the test shows that only the requested
    # "custom" group ends up in the output.
    doc.spans["sc"] = [Span(doc, 3, 6, "ORG"), Span(doc, 5, 6, "GPE")]
    doc.spans["custom"] = [Span(doc, 3, 6, "BANK")]

    parsed = displacy.parse_spans(doc, options={"spans_key": "custom"})

    expected_spans = [
        dict(
            start=15,
            end=28,
            start_token=3,
            end_token=6,
            label="BANK",
            kb_id="",
            kb_url="#",
        )
    ]
    assert isinstance(parsed, dict)
    assert parsed["text"] == "Welcome to the Bank of China "
    assert parsed["spans"] == expected_spans


def test_displacy_parse_ents(en_vocab):
    """Check that ``doc.ents`` are converted into displaCy's format, both
    without and with a kb_id on the entity."""
    tokens = ["But", "Google", "is", "starting", "from", "behind"]
    doc = Doc(en_vocab, words=tokens)
    # No URL template is passed to parse_ents, so kb_url is expected to be "#"
    # in both cases.
    cases = (
        ({}, ""),
        ({"kb_id": "Q95"}, "Q95"),
    )
    for span_kwargs, expected_kb_id in cases:
        org_label = doc.vocab.strings["ORG"]
        doc.ents = [Span(doc, 1, 2, label=org_label, **span_kwargs)]
        parsed = displacy.parse_ents(doc)
        assert isinstance(parsed, dict)
        assert parsed["text"] == "But Google is starting from behind "
        assert parsed["ents"] == [
            dict(start=4, end=10, label="ORG", kb_id=expected_kb_id, kb_url="#")
        ]


def test_displacy_parse_ents_with_kb_id_options(en_vocab):
    """Check that an entity with a kb_id gets a URL built from the
    ``kb_url_template`` option."""
    tokens = ["But", "Google", "is", "starting", "from", "behind"]
    doc = Doc(en_vocab, words=tokens)
    org_label = doc.vocab.strings["ORG"]
    doc.ents = [Span(doc, 1, 2, label=org_label, kb_id="Q95")]

    render_options = {"kb_url_template": "https://www.wikidata.org/wiki/{}"}
    parsed = displacy.parse_ents(doc, render_options)

    expected_ents = [
        dict(
            start=4,
            end=10,
            label="ORG",
            kb_id="Q95",
            kb_url="https://www.wikidata.org/wiki/Q95",
        )
    ]
    assert isinstance(parsed, dict)
    assert parsed["text"] == "But Google is starting from behind "
    assert parsed["ents"] == expected_ents


def test_displacy_parse_deps(en_vocab):
    """Check that deps and tags on a Doc are converted into displaCy's format."""
    words = ["This", "is", "a", "sentence"]
    pos = ["DET", "VERB", "DET", "NOUN"]
    doc = Doc(
        en_vocab,
        words=words,
        heads=[1, 1, 3, 1],
        pos=pos,
        tags=["DT", "VBZ", "DT", "NN"],
        deps=["nsubj", "ROOT", "det", "attr"],
    )
    parsed = displacy.parse_deps(doc)
    assert isinstance(parsed, dict)
    # The "tag" field of each word is expected to carry the coarse POS value,
    # not the fine-grained tag that was also set on the Doc.
    expected_words = [
        {"lemma": None, "text": text, "tag": coarse}
        for text, coarse in zip(words, pos)
    ]
    assert parsed["words"] == expected_words
    expected_arcs = [
        dict(start=0, end=1, label="nsubj", dir="left"),
        dict(start=2, end=3, label="det", dir="left"),
        dict(start=1, end=3, label="attr", dir="right"),
    ]
    assert parsed["arcs"] == expected_arcs


def test_displacy_invalid_arcs():
    """Check that the dependency renderer raises ``ValueError`` for an arc
    whose start/end lie outside the two-word input."""
    parsed = {
        "words": [{"text": "This", "tag": "DET"}, {"text": "is", "tag": "VERB"}],
        "arcs": [
            {"start": 0, "end": 1, "label": "nsubj", "dir": "left"},
            # start=-1 and end=2 do not refer to any of the two words.
            {"start": -1, "end": 2, "label": "det", "dir": "left"},
        ],
    }
    dep_renderer = DependencyRenderer()
    with pytest.raises(ValueError):
        dep_renderer.render([parsed])


def test_displacy_spans(en_vocab):
    """Check that a Span (a slice of a Doc) can be passed to ``displacy.render``."""
    tokens = ["But", "Google", "is", "starting", "from", "behind"]
    doc = Doc(en_vocab, words=tokens)
    org_label = doc.vocab.strings["ORG"]
    doc.ents = [Span(doc, 1, 2, label=org_label)]
    doc_slice = doc[1:4]
    markup = displacy.render(doc_slice, style="ent")
    assert markup.startswith("<div")


def test_displacy_raises_for_wrong_type(en_vocab):
    """Check that rendering a plain string raises ``ValueError``."""
    not_a_doc = "hello world"
    with pytest.raises(ValueError):
        displacy.render(not_a_doc)


def test_displacy_rtl():
    """Check that rendered pages for a Persian Doc carry RTL direction and
    language markers, for both the "dep" and the "ent" style."""
    nlp = Persian()
    # Source: http://www.sobhe.ir/hazm/ – is this correct?
    words = ["ما", "بسیار", "کتاب", "می\u200cخوانیم"]
    # These are (likely) wrong, but it's just for testing
    pos = ["PRO", "ADV", "N_PL", "V_SUB"]  # needs to match lang.fa.tag_map
    doc = Doc(
        nlp.vocab,
        words=words,
        tags=pos,
        heads=[1, 0, 3, 1],
        deps=["foo", "bar", "foo", "baz"],
    )
    doc.ents = [Span(doc, 1, 3, label="TEST")]
    lang_attr = f'lang="{nlp.lang}"'

    dep_page = displacy.render(doc, page=True, style="dep")
    assert "direction: rtl" in dep_page
    assert 'direction="rtl"' in dep_page
    assert lang_attr in dep_page

    ent_page = displacy.render(doc, page=True, style="ent")
    assert "direction: rtl" in ent_page
    assert lang_attr in ent_page


def test_displacy_render_wrapper(en_vocab):
    """Test that displaCy accepts custom rendering wrapper.

    The wrapper is set on the ``displacy`` module and stays in effect until it
    is replaced, so it is reset in a ``finally`` clause. Without that, a
    failing assertion would skip the restore step and leave the "TEST"
    wrapper active for tests that run afterwards.
    """

    def wrapper(html):
        return "TEST" + html + "TEST"

    displacy.set_render_wrapper(wrapper)
    try:
        doc = Doc(
            en_vocab, words=["But", "Google", "is", "starting", "from", "behind"]
        )
        doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"])]
        html = displacy.render(doc, style="ent")
        assert html.startswith("TEST<div")
        assert html.endswith("/div>TEST")
    finally:
        # Restore an identity wrapper even when rendering or an assertion fails.
        displacy.set_render_wrapper(lambda html: html)


def test_displacy_options_case():
    """Check that the ``ents`` and ``colors`` options match labels regardless
    of case, while each label is rendered with its original casing."""
    entity_renderer = EntityRenderer(
        {"ents": ["foo", "BAR"], "colors": {"FOO": "red", "bar": "green"}}
    )
    labels = ["foo", "bar", "FOO", "BAR"]
    # One single-character span per label, at positions 0..3.
    spans = [
        {"start": pos, "end": pos + 1, "label": label}
        for pos, label in enumerate(labels)
    ]
    # NOTE(review): the rendered text is one character longer than the span
    # positions cover; kept exactly as in the original test - confirm intent.
    chunks = entity_renderer.render_ents("abcde", spans, None).split("\n\n")
    expected_colors = ["red", "green", "red", "green"]
    for idx, (color, label) in enumerate(zip(expected_colors, labels)):
        assert color in chunks[idx] and label in chunks[idx]


@pytest.mark.issue(10672)
def test_displacy_manual_sorted_entities():
    """Test that manually supplied entities are rendered in order of position.

    The input lists the entity labelled "SECOND" (start 14) before the one
    labelled "FIRST" (start 4); in the rendered HTML "FIRST" must come first.
    """
    doc = {
        "text": "But Google is starting from behind.",
        "ents": [
            {"start": 14, "end": 22, "label": "SECOND"},
            {"start": 4, "end": 10, "label": "FIRST"},
        ],
        "title": None,
    }

    html = displacy.render(doc, style="ent", manual=True)
    first_pos = html.find("FIRST")
    second_pos = html.find("SECOND")
    # str.find returns -1 for a missing substring, so a bare
    # `first_pos < second_pos` would pass if "FIRST" were never rendered.
    # Requiring a non-negative position for "FIRST" closes that gap; a missing
    # "SECOND" (-1) can never exceed a non-negative position, so it fails too.
    assert 0 <= first_pos < second_pos
-												Migrate regression tests into the main test suite (#9655)

* Migrate regressions 1-1000

* Move serialize test to correct file

* Remove tests that won't work in v3

* Migrate regressions 1000-1500

Removed regression test 1250 because v3 doesn't support the old LEX
scheme anymore.

* Add missing imports in serializer tests

* Migrate tests 1500-2000

* Migrate regressions from 2000-2500

* Migrate regressions from 2501-3000

* Migrate regressions from 3000-3501

* Migrate regressions from 3501-4000

* Migrate regressions from 4001-4500

* Migrate regressions from 4501-5000

* Migrate regressions from 5001-5501

* Migrate regressions from 5501 to 7000

* Migrate regressions from 7001 to 8000

* Migrate remaining regression tests

* Fixing missing imports

* Update docs with new system [ci skip]

* Update CONTRIBUTING.md

- Fix formatting
- Update wording

* Remove lemmatizer tests in el lang

* Move a few tests into the general tokenizer

* Separate Doc and DocBin tests
											
										
										
											2021-12-04 22:34:48 +03:00
+								import numpy
-												Move displaCy tests to own file

											
										
										
											2019-03-11 17:28:34 +03:00
+								import pytest
-												Displacy serve entity linking support without `manual=True` support. (#9748)

* Add support for kb_id to be displayed via displacy.serve. The current support is only limited to the manual option in displacy.render

* Commit to check pre-commit hooks are run.

* Update spacy/displacy/__init__.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Changes as per suggestions on the PR.

* Update website/docs/api/top-level.md

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update website/docs/api/top-level.md

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* tag option as new from 3.2.1 onwards

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
											
										
										
											2021-11-29 19:13:26 +03:00
-												Move displaCy tests to own file

											
										
										
											2019-03-11 17:28:34 +03:00
+								from spacy import displacy
-												Adjust label casing in displaCy NER visualizer (resolves #4866)

- Accept any case for label names in ents and colors option, even if actual predicted label uses different casing
- Don't text-transform: uppercase visually, if it's important to users that the label is represented as-is in the UI

											
										
										
											2020-08-21 12:51:31 +03:00
+								from spacy.displacy.render import DependencyRenderer, EntityRenderer
-												Migrate regression tests into the main test suite (#9655)

* Migrate regressions 1-1000

* Move serialize test to correct file

* Remove tests that won't work in v3

* Migrate regressions 1000-1500

Removed regression test 1250 because v3 doesn't support the old LEX
scheme anymore.

* Add missing imports in serializer tests

* Migrate tests 1500-2000

* Migrate regressions from 2000-2500

* Migrate regressions from 2501-3000

* Migrate regressions from 3000-3501

* Migrate regressions from 3501-4000

* Migrate regressions from 4001-4500

* Migrate regressions from 4501-5000

* Migrate regressions from 5001-5501

* Migrate regressions from 5501 to 7000

* Migrate regressions from 7001 to 8000

* Migrate remaining regression tests

* Fixing missing imports

* Update docs with new system [ci skip]

* Update CONTRIBUTING.md

- Fix formatting
- Update wording

* Remove lemmatizer tests in el lang

* Move a few tests into the general tokenizer

* Separate Doc and DocBin tests
											
										
										
											2021-12-04 22:34:48 +03:00
+								from spacy.lang.en import English
-												💫 Fix displaCy support for RTL languages (#3393)

Closes #2091.

## Description

With the new `vocab.writing_system` property introduced in #3390 (exposed via the language defaults), I was able to finally fix this (I think!). Based on the `Doc`, displaCy now detects whether it's a RTL or LTR language and adjusts the visualization accordingly. Wherever possible, I've also added `direction` and `lang` attributes.

Entity visualization now looks like this:

<img width="318" alt="Screenshot 2019-03-11 at 16 06 51" src="https://user-images.githubusercontent.com/13643239/54136866-d97afd80-441c-11e9-8c27-3d46994cc833.png">

And dependencies like this (ignore the most likely incorrect tags and dependencies):

<img width="621" alt="Screenshot 2019-03-11 at 16 51 59" src="https://user-images.githubusercontent.com/13643239/54137771-8b66f980-441e-11e9-8460-0682b95eef2a.png">

### Types of change
enhancement, bug fix

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-11 20:52:50 +03:00
+								from spacy.lang.fa import Persian
-												Displacy serve entity linking support without `manual=True` support. (#9748)

* Add support for kb_id to be displayed via displacy.serve. The current support is only limited to the manual option in displacy.render

* Commit to check pre-commit hooks are run.

* Update spacy/displacy/__init__.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Changes as per suggestions on the PR.

* Update website/docs/api/top-level.md

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update website/docs/api/top-level.md

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* tag option as new from 3.2.1 onwards

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
											
										
										
											2021-11-29 19:13:26 +03:00
+								from spacy.tokens import Span, Doc
-												Move displaCy tests to own file

											
										
										
											2019-03-11 17:28:34 +03:00
-												Migrate regression tests into the main test suite (#9655)

* Migrate regressions 1-1000

* Move serialize test to correct file

* Remove tests that won't work in v3

* Migrate regressions 1000-1500

Removed regression test 1250 because v3 doesn't support the old LEX
scheme anymore.

* Add missing imports in serializer tests

* Migrate tests 1500-2000

* Migrate regressions from 2000-2500

* Migrate regressions from 2501-3000

* Migrate regressions from 3000-3501

* Migrate regressions from 3501-4000

* Migrate regressions from 4001-4500

* Migrate regressions from 4501-5000

* Migrate regressions from 5001-5501

* Migrate regressions from 5501 to 7000

* Migrate regressions from 7001 to 8000

* Migrate remaining regression tests

* Fixing missing imports

* Update docs with new system [ci skip]

* Update CONTRIBUTING.md

- Fix formatting
- Update wording

* Remove lemmatizer tests in el lang

* Move a few tests into the general tokenizer

* Separate Doc and DocBin tests
											
										
										
											2021-12-04 22:34:48 +03:00
+								@pytest.mark.issue(2361)
 								def test_issue2361(de_vocab):
 								    """Test if < is escaped when rendering"""
 								    chars = ("&lt;", "&gt;", "&amp;", "&quot;")
 								    words = ["<", ">", "&", '"']
 								    doc = Doc(de_vocab, words=words, deps=["dep"] * len(words))
 								    html = displacy.render(doc)
 								    for char in chars:
 								        assert char in html
 								@pytest.mark.issue(2728)
 								def test_issue2728(en_vocab):
 								    """Test that displaCy ENT visualizer escapes HTML correctly."""
 								    doc = Doc(en_vocab, words=["test", "<RELEASE>", "test"])
 								    doc.ents = [Span(doc, 0, 1, label="TEST")]
 								    html = displacy.render(doc, style="ent")
 								    assert "&lt;RELEASE&gt;" in html
 								    doc.ents = [Span(doc, 1, 2, label="TEST")]
 								    html = displacy.render(doc, style="ent")
 								    assert "&lt;RELEASE&gt;" in html
 								@pytest.mark.issue(3288)
 								def test_issue3288(en_vocab):
 								    """Test that retokenization works correctly via displaCy when punctuation
 								    is merged onto the preceeding token and tensor is resized."""
 								    words = ["Hello", "World", "!", "When", "is", "this", "breaking", "?"]
 								    heads = [1, 1, 1, 4, 4, 6, 4, 4]
 								    deps = ["intj", "ROOT", "punct", "advmod", "ROOT", "det", "nsubj", "punct"]
 								    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
 								    doc.tensor = numpy.zeros((len(words), 96), dtype="float32")
 								    displacy.render(doc)
 								@pytest.mark.issue(3531)
 								def test_issue3531():
 								    """Test that displaCy renderer doesn't require "settings" key."""
 								    example_dep = {
 								        "words": [
 								            {"text": "But", "tag": "CCONJ"},
 								            {"text": "Google", "tag": "PROPN"},
 								            {"text": "is", "tag": "VERB"},
 								            {"text": "starting", "tag": "VERB"},
 								            {"text": "from", "tag": "ADP"},
 								            {"text": "behind.", "tag": "ADV"},
 								        ],
 								        "arcs": [
 								            {"start": 0, "end": 3, "label": "cc", "dir": "left"},
 								            {"start": 1, "end": 3, "label": "nsubj", "dir": "left"},
 								            {"start": 2, "end": 3, "label": "aux", "dir": "left"},
 								            {"start": 3, "end": 4, "label": "prep", "dir": "right"},
 								            {"start": 4, "end": 5, "label": "pcomp", "dir": "right"},
 								        ],
 								    }
 								    example_ent = {
 								        "text": "But Google is starting from behind.",
 								        "ents": [{"start": 4, "end": 10, "label": "ORG"}],
 								    }
 								    dep_html = displacy.render(example_dep, style="dep", manual=True)
 								    assert dep_html
 								    ent_html = displacy.render(example_ent, style="ent", manual=True)
 								    assert ent_html
 								@pytest.mark.issue(3882)
 								def test_issue3882(en_vocab):
 								    """Test that displaCy doesn't serialize the doc.user_data when making a
 								    copy of the Doc.
 								    """
 								    doc = Doc(en_vocab, words=["Hello", "world"], deps=["dep", "dep"])
 								    doc.user_data["test"] = set()
 								    displacy.parse_deps(doc)
-												displaCy: Avoid increasing levels for identical arcs (#10639)

* Test for arc levels for identical arcs

Also moves the test in order with the other numbered tests.

* displaCy: filter identical arcs

Avoid increased levels due to identical arcs by first
filtering any identical arcs.

* Sort keys before filtering

Manual entry with keys out of order would previously become
different tuples and therefore not filtered correctly.

Co-authored-by: Joachim Fainberg <joachimfainberg@Joachims-MBP.lan>
											
										
										
											2022-04-14 17:48:00 +03:00
+								@pytest.mark.issue(5447)
 								def test_issue5447():
 								    """Test that overlapping arcs get separate levels, unless they're identical."""
 								    renderer = DependencyRenderer()
 								    words = [
 								        {"text": "This", "tag": "DT"},
 								        {"text": "is", "tag": "VBZ"},
 								        {"text": "a", "tag": "DT"},
 								        {"text": "sentence.", "tag": "NN"},
 								    ]
 								    arcs = [
 								        {"start": 0, "end": 1, "label": "nsubj", "dir": "left"},
 								        {"start": 2, "end": 3, "label": "det", "dir": "left"},
 								        {"start": 2, "end": 3, "label": "overlap", "dir": "left"},
 								        {"end": 3, "label": "overlap", "start": 2, "dir": "left"},
 								        {"start": 1, "end": 3, "label": "attr", "dir": "left"},
 								    ]
 								    renderer.render([{"words": words, "arcs": arcs}])
 								    assert renderer.highest_level == 3
-												Migrate regression tests into the main test suite (#9655)

* Migrate regressions 1-1000

* Move serialize test to correct file

* Remove tests that won't work in v3

* Migrate regressions 1000-1500

Removed regression test 1250 because v3 doesn't support the old LEX
scheme anymore.

* Add missing imports in serializer tests

* Migrate tests 1500-2000

* Migrate regressions from 2000-2500

* Migrate regressions from 2501-3000

* Migrate regressions from 3000-3501

* Migrate regressions from 3501-4000

* Migrate regressions from 4001-4500

* Migrate regressions from 4501-5000

* Migrate regressions from 5001-5501

* Migrate regressions from 5501 to 7000

* Migrate regressions from 7001 to 8000

* Migrate remaining regression tests

* Fixing missing imports

* Update docs with new system [ci skip]

* Update CONTRIBUTING.md

- Fix formatting
- Update wording

* Remove lemmatizer tests in el lang

* Move a few tests into the general tokenizer

* Separate Doc and DocBin tests
											
										
										
											2021-12-04 22:34:48 +03:00
+								@pytest.mark.issue(5838)
 								def test_issue5838():
 								    # Displacy's EntityRenderer break line
 								    # not working after last entity
 								    sample_text = "First line\nSecond line, with ent\nThird line\nFourth line\n"
 								    nlp = English()
 								    doc = nlp(sample_text)
 								    doc.ents = [Span(doc, 7, 8, label="test")]
 								    html = displacy.render(doc, style="ent")
 								    found = html.count("</br>")
 								    assert found == 4
-												Add displacy support for overlapping Spans (#10332)

* Fix docstring for EntityRenderer

* Add warning in displacy if doc.spans are empty

* Implement parse_spans converter

One notable change here is that the default spans_key is sc, and
it's set by the user through the options.

* Implement SpanRenderer

Here, I implemented a SpanRenderer that looks similar to the
EntityRenderer except for some templates.  The spans_key, by default, is
set to sc, but can be configured in the options (see parse_spans). The
way I rendered these spans is per-token, i.e., I first check if each
token (1) belongs to a given span type and (2) a starting token of a
given span type. Once I have this information, I render them into the
markup.

* Fix mypy issues on typing

* Add tests for displacy spans support

* Update colors from RGB to hex

Co-authored-by: Ines Montani <ines@ines.io>

* Remove unnecessary CSS properties

* Add documentation for website

* Remove unnecessary scripts

* Update wording on the documentation

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Put typing dependency on top of file

* Put back z-index so that spans overlap properly

* Make warning more explicit for spans_key

Co-authored-by: Ines Montani <ines@ines.io>
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
											
										
										
											2022-03-16 20:14:34 +03:00
+								def test_displacy_parse_spans(en_vocab):
 								    """Test that spans on a Doc are converted into displaCy's format."""
 								    doc = Doc(en_vocab, words=["Welcome", "to", "the", "Bank", "of", "China"])
 								    doc.spans["sc"] = [Span(doc, 3, 6, "ORG"), Span(doc, 5, 6, "GPE")]
 								    spans = displacy.parse_spans(doc)
 								    assert isinstance(spans, dict)
 								    assert spans["text"] == "Welcome to the Bank of China "
 								    assert spans["spans"] == [
 								        {
 								            "start": 15,
 								            "end": 28,
 								            "start_token": 3,
 								            "end_token": 6,
 								            "label": "ORG",
 								            "kb_id": "",
 								            "kb_url": "#",
 								        },
 								        {
 								            "start": 23,
 								            "end": 28,
 								            "start_token": 5,
 								            "end_token": 6,
 								            "label": "GPE",
 								            "kb_id": "",
 								            "kb_url": "#",
 								        },
 								    ]
 								def test_displacy_parse_spans_with_kb_id_options(en_vocab):
 								    """Test that spans with kb_id on a Doc are converted into displaCy's format"""
 								    doc = Doc(en_vocab, words=["Welcome", "to", "the", "Bank", "of", "China"])
 								    doc.spans["sc"] = [
 								        Span(doc, 3, 6, "ORG", kb_id="Q790068"),
 								        Span(doc, 5, 6, "GPE", kb_id="Q148"),
 								    ]
 								    spans = displacy.parse_spans(
 								        doc, {"kb_url_template": "https://wikidata.org/wiki/{}"}
 								    )
 								    assert isinstance(spans, dict)
 								    assert spans["text"] == "Welcome to the Bank of China "
 								    assert spans["spans"] == [
 								        {
 								            "start": 15,
 								            "end": 28,
 								            "start_token": 3,
 								            "end_token": 6,
 								            "label": "ORG",
 								            "kb_id": "Q790068",
 								            "kb_url": "https://wikidata.org/wiki/Q790068",
 								        },
 								        {
 								            "start": 23,
 								            "end": 28,
 								            "start_token": 5,
 								            "end_token": 6,
 								            "label": "GPE",
 								            "kb_id": "Q148",
 								            "kb_url": "https://wikidata.org/wiki/Q148",
 								        },
 								    ]
 								def test_displacy_parse_spans_different_spans_key(en_vocab):
 								    """Test that spans in a different spans key will be parsed"""
 								    doc = Doc(en_vocab, words=["Welcome", "to", "the", "Bank", "of", "China"])
 								    doc.spans["sc"] = [Span(doc, 3, 6, "ORG"), Span(doc, 5, 6, "GPE")]
 								    doc.spans["custom"] = [Span(doc, 3, 6, "BANK")]
 								    spans = displacy.parse_spans(doc, options={"spans_key": "custom"})
 								    assert isinstance(spans, dict)
 								    assert spans["text"] == "Welcome to the Bank of China "
 								    assert spans["spans"] == [
 								        {
 								            "start": 15,
 								            "end": 28,
 								            "start_token": 3,
 								            "end_token": 6,
 								            "label": "BANK",
 								            "kb_id": "",
 								            "kb_url": "#",
 								        }
 								    ]
-												Move displaCy tests to own file

											
										
										
											2019-03-11 17:28:34 +03:00
+								def test_displacy_parse_ents(en_vocab):
 								    """Test that named entities on a Doc are converted into displaCy's format."""
-												Tidy up tests and docs

											
										
										
											2020-09-21 21:43:54 +03:00
+								    doc = Doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"])
-												Move displaCy tests to own file

											
										
										
											2019-03-11 17:28:34 +03:00
+								    doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"])]
 								    ents = displacy.parse_ents(doc)
 								    assert isinstance(ents, dict)
 								    assert ents["text"] == "But Google is starting from behind "
-												Displacy serve entity linking support without `manual=True` support. (#9748)

* Add support for kb_id to be displayed via displacy.serve. The current support is only limited to the manual option in displacy.render

* Commit to check pre-commit hooks are run.

* Update spacy/displacy/__init__.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Changes as per suggestions on the PR.

* Update website/docs/api/top-level.md

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update website/docs/api/top-level.md

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* tag option as new from 3.2.1 onwards

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
											
										
										
											2021-11-29 19:13:26 +03:00
+								    assert ents["ents"] == [
 								        {"start": 4, "end": 10, "label": "ORG", "kb_id": "", "kb_url": "#"}
 								    ]
 								    doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"], kb_id="Q95")]
 								    ents = displacy.parse_ents(doc)
 								    assert isinstance(ents, dict)
 								    assert ents["text"] == "But Google is starting from behind "
 								    assert ents["ents"] == [
 								        {"start": 4, "end": 10, "label": "ORG", "kb_id": "Q95", "kb_url": "#"}
 								    ]
 								def test_displacy_parse_ents_with_kb_id_options(en_vocab):
 								    """Test that parse_ents builds kb_url from the `kb_url_template` option."""
 								    words = ["But", "Google", "is", "starting", "from", "behind"]
 								    doc = Doc(en_vocab, words=words)
 								    org_label = doc.vocab.strings["ORG"]
 								    doc.ents = [Span(doc, 1, 2, label=org_label, kb_id="Q95")]
 								    options = {"kb_url_template": "https://www.wikidata.org/wiki/{}"}
 								    expected_ent = {
 								        "start": 4,
 								        "end": 10,
 								        "label": "ORG",
 								        "kb_id": "Q95",
 								        "kb_url": "https://www.wikidata.org/wiki/Q95",
 								    }
 								    parsed = displacy.parse_ents(doc, options)
 								    assert isinstance(parsed, dict)
 								    assert parsed["text"] == "But Google is starting from behind "
 								    assert parsed["ents"] == [expected_ent]
-												Move displaCy tests to own file

											
										
										
											2019-03-11 17:28:34 +03:00
 								def test_displacy_parse_deps(en_vocab):
 								    """Test that deps and tags on a Doc are converted into displaCy's format."""
 								    words = ["This", "is", "a", "sentence"]
-												Tidy up tests and docs

											
										
										
											2020-09-21 21:43:54 +03:00
+								    heads = [1, 1, 3, 1]
-												Move displaCy tests to own file

											
										
										
											2019-03-11 17:28:34 +03:00
+								    pos = ["DET", "VERB", "DET", "NOUN"]
 								    tags = ["DT", "VBZ", "DT", "NN"]
 								    deps = ["nsubj", "ROOT", "det", "attr"]
-												Tidy up tests and docs

											
										
										
											2020-09-21 21:43:54 +03:00
+								    doc = Doc(en_vocab, words=words, heads=heads, pos=pos, tags=tags, deps=deps)
-												Move displaCy tests to own file

											
										
										
											2019-03-11 17:28:34 +03:00
+								    deps = displacy.parse_deps(doc)
 								    assert isinstance(deps, dict)
 								    assert deps["words"] == [
-												Bugfix/get doc (#5049)

* new (broken) unit test

* fixing get_doc method

											
										
										
											2020-03-02 13:49:28 +03:00
+								        {"lemma": None, "text": words[0], "tag": pos[0]},
 								        {"lemma": None, "text": words[1], "tag": pos[1]},
 								        {"lemma": None, "text": words[2], "tag": pos[2]},
 								        {"lemma": None, "text": words[3], "tag": pos[3]},
-												Move displaCy tests to own file

											
										
										
											2019-03-11 17:28:34 +03:00
+								    ]
 								    assert deps["arcs"] == [
 								        {"start": 0, "end": 1, "label": "nsubj", "dir": "left"},
 								        {"start": 2, "end": 3, "label": "det", "dir": "left"},
 								        {"start": 1, "end": 3, "label": "attr", "dir": "right"},
 								    ]
-												Raise error for negative arc indices (closes #3917)

											
										
										
											2019-08-20 16:51:37 +03:00
+								def test_displacy_invalid_arcs():
 								    renderer = DependencyRenderer()
 								    words = [{"text": "This", "tag": "DET"}, {"text": "is", "tag": "VERB"}]
 								    arcs = [
 								        {"start": 0, "end": 1, "label": "nsubj", "dir": "left"},
 								        {"start": -1, "end": 2, "label": "det", "dir": "left"},
 								    ]
 								    with pytest.raises(ValueError):
 								        renderer.render([{"words": words, "arcs": arcs}])
-												Move displaCy tests to own file

											
										
										
											2019-03-11 17:28:34 +03:00
+								def test_displacy_spans(en_vocab):
 								    """Test that displaCy can render Spans."""
-												Tidy up tests and docs

											
										
										
											2020-09-21 21:43:54 +03:00
+								    doc = Doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"])
-												Move displaCy tests to own file

											
										
										
											2019-03-11 17:28:34 +03:00
+								    doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"])]
 								    html = displacy.render(doc[1:4], style="ent")
 								    assert html.startswith("<div")
 								def test_displacy_raises_for_wrong_type(en_vocab):
 								    """Test that rendering a plain string raises a ValueError."""
 								    not_a_doc = "hello world"
 								    with pytest.raises(ValueError):
 								        displacy.render(not_a_doc)
-												💫 Fix displaCy support for RTL languages (#3393)

Closes #2091.

## Description

With the new `vocab.writing_system` property introduced in #3390 (exposed via the language defaults), I was able to finally fix this (I think!). Based on the `Doc`, displaCy now detects whether it's an RTL or LTR language and adjusts the visualization accordingly. Wherever possible, I've also added `direction` and `lang` attributes.

Entity visualization now looks like this:

<img width="318" alt="Screenshot 2019-03-11 at 16 06 51" src="https://user-images.githubusercontent.com/13643239/54136866-d97afd80-441c-11e9-8c27-3d46994cc833.png">

And dependencies like this (ignore the most likely incorrect tags and dependencies):

<img width="621" alt="Screenshot 2019-03-11 at 16 51 59" src="https://user-images.githubusercontent.com/13643239/54137771-8b66f980-441e-11e9-8460-0682b95eef2a.png">

### Types of change
enhancement, bug fix

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-11 20:52:50 +03:00
 								def test_displacy_rtl():
 								    # Source: http://www.sobhe.ir/hazm/ – is this correct?
 								    words = ["ما", "بسیار", "کتاب", "می\u200cخوانیم"]
 								    # These are (likely) wrong, but it's just for testing
 								    pos = ["PRO", "ADV", "N_PL", "V_SUB"]  # needs to match lang.fa.tag_map
 								    deps = ["foo", "bar", "foo", "baz"]
-												Tidy up tests and docs

											
										
										
											2020-09-21 21:43:54 +03:00
+								    heads = [1, 0, 3, 1]
-												💫 Fix displaCy support for RTL languages (#3393)

Closes #2091.

## Description

With the new `vocab.writing_system` property introduced in #3390 (exposed via the language defaults), I was able to finally fix this (I think!). Based on the `Doc`, displaCy now detects whether it's an RTL or LTR language and adjusts the visualization accordingly. Wherever possible, I've also added `direction` and `lang` attributes.

Entity visualization now looks like this:

<img width="318" alt="Screenshot 2019-03-11 at 16 06 51" src="https://user-images.githubusercontent.com/13643239/54136866-d97afd80-441c-11e9-8c27-3d46994cc833.png">

And dependencies like this (ignore the most likely incorrect tags and dependencies):

<img width="621" alt="Screenshot 2019-03-11 at 16 51 59" src="https://user-images.githubusercontent.com/13643239/54137771-8b66f980-441e-11e9-8460-0682b95eef2a.png">

### Types of change
enhancement, bug fix

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-11 20:52:50 +03:00
+								    nlp = Persian()
-												Tidy up tests and docs

											
										
										
											2020-09-21 21:43:54 +03:00
+								    doc = Doc(nlp.vocab, words=words, tags=pos, heads=heads, deps=deps)
-												💫 Fix displaCy support for RTL languages (#3393)

Closes #2091.

## Description

With the new `vocab.writing_system` property introduced in #3390 (exposed via the language defaults), I was able to finally fix this (I think!). Based on the `Doc`, displaCy now detects whether it's an RTL or LTR language and adjusts the visualization accordingly. Wherever possible, I've also added `direction` and `lang` attributes.

Entity visualization now looks like this:

<img width="318" alt="Screenshot 2019-03-11 at 16 06 51" src="https://user-images.githubusercontent.com/13643239/54136866-d97afd80-441c-11e9-8c27-3d46994cc833.png">

And dependencies like this (ignore the most likely incorrect tags and dependencies):

<img width="621" alt="Screenshot 2019-03-11 at 16 51 59" src="https://user-images.githubusercontent.com/13643239/54137771-8b66f980-441e-11e9-8460-0682b95eef2a.png">

### Types of change
enhancement, bug fix

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-11 20:52:50 +03:00
+								    doc.ents = [Span(doc, 1, 3, label="TEST")]
 								    html = displacy.render(doc, page=True, style="dep")
 								    assert "direction: rtl" in html
 								    assert 'direction="rtl"' in html
-												Drop Python 2.7 and 3.5 (#4828)

* Remove unicode declarations

* Remove Python 3.5 and 2.7 from CI

* Don't require pathlib

* Replace compat helpers

* Remove OrderedDict

* Use f-strings

* Set Cython compiler language level

* Fix typo

* Re-add OrderedDict for Table

* Update setup.cfg

* Revert CONTRIBUTING.md

* Revert lookups.md

* Revert top-level.md

* Small adjustments and docs [ci skip]

											
										
										
											2019-12-22 03:53:56 +03:00
+								    assert f'lang="{nlp.lang}"' in html
-												💫 Fix displaCy support for RTL languages (#3393)

Closes #2091.

## Description

With the new `vocab.writing_system` property introduced in #3390 (exposed via the language defaults), I was able to finally fix this (I think!). Based on the `Doc`, displaCy now detects whether it's an RTL or LTR language and adjusts the visualization accordingly. Wherever possible, I've also added `direction` and `lang` attributes.

Entity visualization now looks like this:

<img width="318" alt="Screenshot 2019-03-11 at 16 06 51" src="https://user-images.githubusercontent.com/13643239/54136866-d97afd80-441c-11e9-8c27-3d46994cc833.png">

And dependencies like this (ignore the most likely incorrect tags and dependencies):

<img width="621" alt="Screenshot 2019-03-11 at 16 51 59" src="https://user-images.githubusercontent.com/13643239/54137771-8b66f980-441e-11e9-8460-0682b95eef2a.png">

### Types of change
enhancement, bug fix

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-11 20:52:50 +03:00
+								    html = displacy.render(doc, page=True, style="ent")
 								    assert "direction: rtl" in html
-												Drop Python 2.7 and 3.5 (#4828)

* Remove unicode declarations

* Remove Python 3.5 and 2.7 from CI

* Don't require pathlib

* Replace compat helpers

* Remove OrderedDict

* Use f-strings

* Set Cython compiler language level

* Fix typo

* Re-add OrderedDict for Table

* Update setup.cfg

* Revert CONTRIBUTING.md

* Revert lookups.md

* Revert top-level.md

* Small adjustments and docs [ci skip]

											
										
										
											2019-12-22 03:53:56 +03:00
+								    assert f'lang="{nlp.lang}"' in html
-												Update test_displacy.py

											
										
										
											2019-03-11 21:03:52 +03:00
-												Auto-format [ci skip]

											
										
										
											2019-03-12 15:35:34 +03:00
-												Update test_displacy.py

											
										
										
											2019-03-11 21:03:52 +03:00
+								def test_displacy_render_wrapper(en_vocab):
 								    """Test that displaCy accepts custom rendering wrapper."""
 								    def wrapper(html):
 								        return "TEST" + html + "TEST"
 								    displacy.set_render_wrapper(wrapper)
-												Tidy up tests and docs

											
										
										
											2020-09-21 21:43:54 +03:00
+								    doc = Doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"])
-												Update test_displacy.py

											
										
										
											2019-03-11 21:03:52 +03:00
+								    doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"])]
 								    html = displacy.render(doc, style="ent")
 								    assert html.startswith("TEST<div")
 								    assert html.endswith("/div>TEST")
-												Auto-format [ci skip]

											
										
										
											2019-03-12 15:35:34 +03:00
+								    # Restore
 								    displacy.set_render_wrapper(lambda html: html)
-												Adjust label casing in displaCy NER visualizer (resolves #4866)

- Accept any case for label names in ents and colors option, even if actual predicted label uses different casing
- Don't text-transform: uppercase visually, if it's important to users that the label is represented as-is in the UI

											
										
										
											2020-08-21 12:51:31 +03:00
 								def test_displacy_options_case():
 								    """Test that the `ents` and `colors` options match labels in any casing."""
 								    options = {
 								        "ents": ["foo", "BAR"],
 								        "colors": {"FOO": "red", "bar": "green"},
 								    }
 								    renderer = EntityRenderer(options)
 								    text = "abcd"
 								    labels = ["foo", "bar", "FOO", "BAR"]
 								    # One single-character span per character of `text`, labelled in order.
 								    spans = [
 								        {"start": pos, "end": pos + 1, "label": labels[pos]}
 								        for pos, _ in enumerate(text)
 								    ]
 								    # NOTE: the rendered string is one character longer than the annotated range.
 								    chunks = renderer.render_ents("abcde", spans, None).split("\n\n")
 								    expected = [("red", "foo"), ("green", "bar"), ("red", "FOO"), ("green", "BAR")]
 								    for idx, (color, label) in enumerate(expected):
 								        assert color in chunks[idx] and label in chunks[idx]
-												#10672: fixes displacy output for manual unsorted entities (#10673)

* #10672: fixes displacy output for manual unsorted entities

* #10672: removed unused import

* fix prettier formatting

Co-authored-by: Harm Buisman <h.buisman@iknl.nl>
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
											
										
										
											2022-04-27 10:51:58 +03:00
 								@pytest.mark.issue(10672)
 								def test_displacy_manual_sorted_entities():
 								    """Test that manually supplied entities are rendered in text order.
 								    The entities are passed out of order (the later span first); the rendered
 								    HTML must still contain the "FIRST" label before the "SECOND" label.
 								    """
 								    doc = {
 								        "text": "But Google is starting from behind.",
 								        "ents": [
 								            {"start": 14, "end": 22, "label": "SECOND"},
 								            {"start": 4, "end": 10, "label": "FIRST"},
 								        ],
 								        "title": None,
 								    }
 								    html = displacy.render(doc, style="ent", manual=True)
 								    first_pos = html.find("FIRST")
 								    second_pos = html.find("SECOND")
 								    # str.find returns -1 for a missing substring, which would make the
 								    # ordering check pass vacuously, so require both labels to be present.
 								    assert first_pos != -1
 								    assert second_pos != -1
 								    assert first_pos < second_pos