add helper function to improve readability

This commit is contained in:
thomashacker 2022-12-05 12:32:34 +01:00
parent 9a41081a26
commit b1ec51491c

View File

@ -3,6 +3,10 @@ from mock import Mock
from spacy.tokens import Doc, Span, Token from spacy.tokens import Doc, Span, Token
from spacy.tokens.underscore import Underscore from spacy.tokens.underscore import Underscore
# Helper functions
def _get_tuple(s: Span, name: str = "span_extension") -> tuple:
    """Build the ``doc.user_data`` key used to look up a span extension value.

    The tests in this file index ``doc.user_data`` with this 7-tuple to read
    back values that were assigned through ``span._.span_extension``.

    s (Span): Span whose current start_char, end_char, label, kb_id and id
        are read. The attributes are read at call time, so the key reflects
        any changes made to the span before the call.
    name (str): Extension name placed in the second slot of the key.
        Defaults to "span_extension", the extension these tests register.
    RETURNS (tuple): ("._.", name, start_char, end_char, label, kb_id, id).
    """
    return ("._.", name, s.start_char, s.end_char, s.label, s.kb_id, s.id)
@pytest.fixture(scope="function", autouse=True) @pytest.fixture(scope="function", autouse=True)
def clean_underscore(): def clean_underscore():
@ -192,163 +196,34 @@ def test_underscore_for_unique_span(en_tokenizer):
span_2._.span_extension = "span_2 extension" span_2._.span_extension = "span_2 extension"
# Assert extensions # Assert extensions
assert ( assert doc.user_data[_get_tuple(span_1)] == "span_1 extension"
doc.user_data[ assert doc.user_data[_get_tuple(span_2)] == "span_2 extension"
(
"._.",
"span_extension",
span_1.start_char,
span_1.end_char,
span_1.label,
span_1.kb_id,
span_1.id,
)
]
== "span_1 extension"
)
assert (
doc.user_data[
(
"._.",
"span_extension",
span_2.start_char,
span_2.end_char,
span_2.label,
span_2.kb_id,
span_2.id,
)
]
== "span_2 extension"
)
# Change label of span and assert extensions # Change label of span and assert extensions
span_1.label_ = "NEW_LABEL" span_1.label_ = "NEW_LABEL"
assert ( assert doc.user_data[_get_tuple(span_1)] == "span_1 extension"
doc.user_data[ assert doc.user_data[_get_tuple(span_2)] == "span_2 extension"
(
"._.",
"span_extension",
span_1.start_char,
span_1.end_char,
span_1.label,
span_1.kb_id,
span_1.id,
)
]
== "span_1 extension"
)
assert (
doc.user_data[
(
"._.",
"span_extension",
span_2.start_char,
span_2.end_char,
span_2.label,
span_2.kb_id,
span_2.id,
)
]
== "span_2 extension"
)
# Change KB_ID and assert extensions # Change KB_ID and assert extensions
span_1.kb_id_ = "KB_ID" span_1.kb_id_ = "KB_ID"
assert ( assert doc.user_data[_get_tuple(span_1)] == "span_1 extension"
doc.user_data[ assert doc.user_data[_get_tuple(span_2)] == "span_2 extension"
(
"._.",
"span_extension",
span_1.start_char,
span_1.end_char,
span_1.label,
span_1.kb_id,
span_1.id,
)
]
== "span_1 extension"
)
assert (
doc.user_data[
(
"._.",
"span_extension",
span_2.start_char,
span_2.end_char,
span_2.label,
span_2.kb_id,
span_2.id,
)
]
== "span_2 extension"
)
# Change extensions and assert # Change extensions and assert
span_2._.span_extension = "updated span_2 extension" span_2._.span_extension = "updated span_2 extension"
assert ( assert doc.user_data[_get_tuple(span_1)] == "span_1 extension"
doc.user_data[ assert doc.user_data[_get_tuple(span_2)] == "updated span_2 extension"
(
"._.",
"span_extension",
span_1.start_char,
span_1.end_char,
span_1.label,
span_1.kb_id,
span_1.id,
)
]
== "span_1 extension"
)
assert (
doc.user_data[
(
"._.",
"span_extension",
span_2.start_char,
span_2.end_char,
span_2.label,
span_2.kb_id,
span_2.id,
)
]
== "updated span_2 extension"
)
# Change span ID and assert extensions # Change span ID and assert extensions
span_2.id = 2 span_2.id = 2
assert ( assert doc.user_data[_get_tuple(span_1)] == "span_1 extension"
doc.user_data[ assert doc.user_data[_get_tuple(span_2)] == "updated span_2 extension"
(
"._.",
"span_extension",
span_1.start_char,
span_1.end_char,
span_1.label,
span_1.kb_id,
span_1.id,
)
]
== "span_1 extension"
)
assert (
doc.user_data[
(
"._.",
"span_extension",
span_2.start_char,
span_2.end_char,
span_2.label,
span_2.kb_id,
span_2.id,
)
]
== "updated span_2 extension"
)
# Assert extensions with original key # Assert extensions with original key
assert doc.user_data[("._.", "doc_extension", None, None)] == "doc extension" assert doc.user_data[("._.", "doc_extension", None, None)] == "doc extension"
assert doc.user_data[("._.", "token_extension", 0, None)] == "token extension" assert doc.user_data[("._.", "token_extension", 0, None)] == "token extension"
def test_underscore_for_unique_span_from_docs(en_tokenizer): def test_underscore_for_unique_span_from_docs(en_tokenizer):
"""Test that spans in the user_data keep the same data structure when using Doc.from_docs""" """Test that spans in the user_data keep the same data structure when using Doc.from_docs"""
Span.set_extension(name="span_extension", default=None) Span.set_extension(name="span_extension", default=None)
@ -373,82 +248,13 @@ def test_underscore_for_unique_span_from_docs(en_tokenizer):
doc = Doc.from_docs([doc_1, doc_2]) doc = Doc.from_docs([doc_1, doc_2])
# Assert extensions # Assert extensions
assert ( assert doc_1.user_data[_get_tuple(span_1a)] == "span_1a extension"
doc_1.user_data[ assert doc_1.user_data[_get_tuple(span_1b)] == "span_1b extension"
( assert doc_2.user_data[_get_tuple(span_2a)] == "span_2a extension"
"._.",
"span_extension",
span_1a.start_char,
span_1a.end_char,
span_1a.label,
span_1a.kb_id,
span_1a.id,
)
]
== "span_1a extension"
)
assert (
doc_1.user_data[
(
"._.",
"span_extension",
span_1b.start_char,
span_1b.end_char,
span_1b.label,
span_1b.kb_id,
span_1b.id,
)
]
== "span_1b extension"
)
assert (
doc_2.user_data[
(
"._.",
"span_extension",
span_2a.start_char,
span_2a.end_char,
span_2a.label,
span_2a.kb_id,
span_2a.id,
)
]
== "span_2a extension"
)
# Check merged doc # Check merged doc
assert ( assert doc.user_data[_get_tuple(span_1a)] == "span_1a extension"
doc.user_data[ assert doc.user_data[_get_tuple(span_1b)] == "span_1b extension"
(
"._.",
"span_extension",
span_1a.start_char,
span_1a.end_char,
span_1a.label,
span_1a.kb_id,
span_1a.id,
)
]
== "span_1a extension"
)
assert (
doc.user_data[
(
"._.",
"span_extension",
span_1b.start_char,
span_1b.end_char,
span_1b.label,
span_1b.kb_id,
span_1b.id,
)
]
== "span_1b extension"
)
assert ( assert (
doc.user_data[ doc.user_data[
( (
@ -464,6 +270,7 @@ def test_underscore_for_unique_span_from_docs(en_tokenizer):
== "span_2a extension" == "span_2a extension"
) )
def test_underscore_for_unique_span_as_span(en_tokenizer): def test_underscore_for_unique_span_as_span(en_tokenizer):
"""Test that spans in the user_data keep the same data structure when using Span.as_doc""" """Test that spans in the user_data keep the same data structure when using Span.as_doc"""
Span.set_extension(name="span_extension", default=None) Span.set_extension(name="span_extension", default=None)
@ -481,33 +288,5 @@ def test_underscore_for_unique_span_as_span(en_tokenizer):
span_doc = span_1.as_doc(copy_user_data=True) span_doc = span_1.as_doc(copy_user_data=True)
# Assert extensions # Assert extensions
assert ( assert span_doc.user_data[_get_tuple(span_1)] == "span_1 extension"
span_doc.user_data[ assert span_doc.user_data[_get_tuple(span_2)] == "span_2 extension"
(
"._.",
"span_extension",
span_1.start_char,
span_1.end_char,
span_1.label,
span_1.kb_id,
span_1.id,
)
]
== "span_1 extension"
)
# Assert extensions
assert (
span_doc.user_data[
(
"._.",
"span_extension",
span_2.start_char,
span_2.end_char,
span_2.label,
span_2.kb_id,
span_2.id,
)
]
== "span_2 extension"
)