From 913992aa3de6f1aa22cb5f31f6a856f791a3b87a Mon Sep 17 00:00:00 2001
From: thomashacker
Date: Fri, 4 Nov 2022 17:21:14 +0100
Subject: [PATCH] add underscore changes and test to Span.as_doc

---
Review notes (ignored by git-am):
- Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy of this patch; confirm the context lines against the tree before
  applying.
- span.pyx: the submitted hunk kept the `len(key) == 4` guard but then read
  key[4], key[5] and key[6], which raises IndexError for every key that can
  enter the branch, while 7-field Span keys fell through to the verbatim
  copy without having their offsets shifted. The guard is now
  `len(key) >= 4` and the key is rebuilt as
  `(data_type, name, start, end) + key[4:]`, which covers both the 4-field
  and the 7-field layout.
- The "index" blob hashes below are those of the original submission.

 spacy/tests/doc/test_underscore.py | 50 +++++++++++++++++++++++++++++-
 spacy/tokens/span.pyx              |  7 +++--
 2 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/spacy/tests/doc/test_underscore.py b/spacy/tests/doc/test_underscore.py
index 96ddb730a..82b53220e 100644
--- a/spacy/tests/doc/test_underscore.py
+++ b/spacy/tests/doc/test_underscore.py
@@ -350,7 +350,7 @@ def test_underscore_for_unique_span(en_tokenizer):
     assert doc.user_data[("._.", "token_extension", 0, None)] == "token extension"
 
 def test_underscore_for_unique_span_from_docs(en_tokenizer):
-    """Test that spans in the user_data keep the same data structure"""
+    """Test that spans in the user_data keep the same data structure when using Doc.from_docs"""
     Span.set_extension(name="span_extension", default=None)
     Token.set_extension(name="token_extension", default=None)
 
@@ -462,4 +462,52 @@ def test_underscore_for_unique_span_from_docs(en_tokenizer):
             )
         ]
         == "span_2a extension"
+    )
+
+def test_underscore_for_unique_span_as_span(en_tokenizer):
+    """Test that spans in the user_data keep the same data structure when using Span.as_doc"""
+    Span.set_extension(name="span_extension", default=None)
+
+    # Initialize doc
+    text = "Hello, world!"
+    doc = en_tokenizer(text)
+    span_1 = Span(doc, 0, 2, "SPAN_1")
+    span_2 = Span(doc, 0, 2, "SPAN_2")
+
+    # Set custom extensions
+    span_1._.span_extension = "span_1 extension"
+    span_2._.span_extension = "span_2 extension"
+
+    span_doc = span_1.as_doc(copy_user_data=True)
+
+    # Assert extensions
+    assert (
+        span_doc.user_data[
+            (
+                "._.",
+                "span_extension",
+                span_1.start_char,
+                span_1.end_char,
+                span_1.label,
+                span_1.kb_id,
+                span_1.id,
+            )
+        ]
+        == "span_1 extension"
+    )
+
+    # Assert extensions
+    assert (
+        span_doc.user_data[
+            (
+                "._.",
+                "span_extension",
+                span_2.start_char,
+                span_2.end_char,
+                span_2.label,
+                span_2.kb_id,
+                span_2.id,
+            )
+        ]
+        == "span_2 extension"
     )
\ No newline at end of file
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index d853f6fef..0435f9260 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -274,12 +274,13 @@ cdef class Span:
             char_offset = self.start_char
             for key, value in self.doc.user_data.items():
-                if isinstance(key, tuple) and len(key) == 4 and key[0] == "._.":
-                    data_type, name, start, end = key
+                if isinstance(key, tuple) and len(key) >= 4 and key[0] == "._.":
+                    data_type, name, start, end = key[:4]
                     if start is not None or end is not None:
                         start -= char_offset
                         if end is not None:
                             end -= char_offset
-                        user_data[(data_type, name, start, end)] = copy.copy(value)
+                        # Span keys carry (label, kb_id, id) after the offsets; keep them as-is.
+                        user_data[(data_type, name, start, end) + key[4:]] = copy.copy(value)
                 else:
                     user_data[key] = copy.copy(value)
             doc.user_data = user_data