mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-06 05:10:21 +03:00
Add underscore changes and test to Span.as_doc
This commit is contained in:
parent
53dc321bb9
commit
913992aa3d
|
@ -350,7 +350,7 @@ def test_underscore_for_unique_span(en_tokenizer):
|
|||
assert doc.user_data[("._.", "token_extension", 0, None)] == "token extension"
|
||||
|
||||
def test_underscore_for_unique_span_from_docs(en_tokenizer):
|
||||
"""Test that spans in the user_data keep the same data structure"""
|
||||
"""Test that spans in the user_data keep the same data structure when using Doc.from_docs"""
|
||||
Span.set_extension(name="span_extension", default=None)
|
||||
Token.set_extension(name="token_extension", default=None)
|
||||
|
||||
|
@ -462,4 +462,52 @@ def test_underscore_for_unique_span_from_docs(en_tokenizer):
|
|||
)
|
||||
]
|
||||
== "span_2a extension"
|
||||
)
|
||||
|
||||
def test_underscore_for_unique_span_as_span(en_tokenizer):
    """Check that Span.as_doc carries span-level extension entries into the new doc.

    Two spans cover the same tokens but have different labels; after
    ``as_doc(copy_user_data=True)`` each span's extension value must still be
    retrievable from the resulting doc's ``user_data`` under its own
    seven-element key.
    """
    Span.set_extension(name="span_extension", default=None)

    # Build a doc and two spans over the same token range, differing by label.
    doc = en_tokenizer("Hello, world!")
    first_span = Span(doc, 0, 2, "SPAN_1")
    second_span = Span(doc, 0, 2, "SPAN_2")

    # Give each span its own extension value.
    first_span._.span_extension = "span_1 extension"
    second_span._.span_extension = "span_2 extension"

    span_doc = first_span.as_doc(copy_user_data=True)

    # Both entries must be present in the copied user_data, keyed by
    # (prefix, extension name, start_char, end_char, label, kb_id, id).
    expectations = (
        (first_span, "span_1 extension"),
        (second_span, "span_2 extension"),
    )
    for span, expected_value in expectations:
        user_data_key = (
            "._.",
            "span_extension",
            span.start_char,
            span.end_char,
            span.label,
            span.kb_id,
            span.id,
        )
        assert span_doc.user_data[user_data_key] == expected_value
|
|
@ -274,12 +274,20 @@ cdef class Span:
|
|||
char_offset = self.start_char
|
||||
for key, value in self.doc.user_data.items():
|
||||
if isinstance(key, tuple) and len(key) == 4 and key[0] == "._.":
|
||||
data_type, name, start, end = key
|
||||
data_type = key[0]
|
||||
name = key[1]
|
||||
start = key[2]
|
||||
end = key[3]
|
||||
if start is not None or end is not None:
|
||||
start -= char_offset
|
||||
if end is not None:
|
||||
end -= char_offset
|
||||
user_data[(data_type, name, start, end)] = copy.copy(value)
|
||||
_label = key[4]
|
||||
_kb_id = key[5]
|
||||
_span_id = key[6]
|
||||
user_data[(data_type, name, start, end, _label, _kb_id, _span_id)] = copy.copy(value)
|
||||
else:
|
||||
user_data[(data_type, name, start, end)] = copy.copy(value)
|
||||
else:
|
||||
user_data[key] = copy.copy(value)
|
||||
doc.user_data = user_data
|
||||
|
|
Loading…
Reference in New Issue
Block a user