Allow copying of user_data in as_doc (#4282)

* Allow copying the user_data with as_doc + unit test

* add option to docs

* add typing

* import fix

* workaround to avoid bool clashing ...

* bint instead of bool
This commit is contained in:
Sofie Van Landeghem 2019-09-12 17:08:14 +02:00 committed by Matthew Honnibal
parent 0760c41393
commit 9be4d1c105
3 changed files with 24 additions and 4 deletions

View File

@ -173,6 +173,21 @@ def test_span_as_doc(doc):
assert span_doc[0].idx == 0 assert span_doc[0].idx == 0
def test_span_as_doc_user_data(doc):
    """Test that the user_data can be preserved (but not by default).

    Stores a custom key/value pair in `doc.user_data`, builds two docs from
    the same slice of `doc` via `as_doc` (once with `copy_user_data=True`,
    once with the default), and checks that only the opted-in doc exposes
    the stored value.
    """
    my_key = "my_info"
    my_value = 342
    doc.user_data[my_key] = my_value
    span = doc[4:10]
    span_doc_with = span.as_doc(copy_user_data=True)
    span_doc_without = span.as_doc()
    # Compare by value, not identity: `is` on an int only passes while the
    # very same object is handed back, which is not what is being tested.
    assert doc.user_data.get(my_key) == my_value
    assert span_doc_with.user_data.get(my_key) == my_value
    # With the default arguments the custom key must be absent.
    assert span_doc_without.user_data.get(my_key) is None
def test_span_string_label_kb_id(doc): def test_span_string_label_kb_id(doc):
span = Span(doc, 0, 1, label="hello", kb_id="Q342") span = Span(doc, 0, 1, label="hello", kb_id="Q342")
assert span.label_ == "hello" assert span.label_ == "hello"

View File

@ -200,13 +200,15 @@ cdef class Span:
return Underscore(Underscore.span_extensions, self, return Underscore(Underscore.span_extensions, self,
start=self.start_char, end=self.end_char) start=self.start_char, end=self.end_char)
def as_doc(self): def as_doc(self, bint copy_user_data=False):
"""Create a `Doc` object with a copy of the `Span`'s data. """Create a `Doc` object with a copy of the `Span`'s data.
copy_user_data (bool): Whether or not to copy the original doc's user data.
RETURNS (Doc): The `Doc` copy of the span. RETURNS (Doc): The `Doc` copy of the span.
DOCS: https://spacy.io/api/span#as_doc DOCS: https://spacy.io/api/span#as_doc
""" """
# TODO: make copy_user_data a keyword-only argument (Python 3 only)
words = [t.text for t in self] words = [t.text for t in self]
spaces = [bool(t.whitespace_) for t in self] spaces = [bool(t.whitespace_) for t in self]
cdef Doc doc = Doc(self.doc.vocab, words=words, spaces=spaces) cdef Doc doc = Doc(self.doc.vocab, words=words, spaces=spaces)
@ -235,6 +237,8 @@ cdef class Span:
cat_start, cat_end, cat_label = key cat_start, cat_end, cat_label = key
if cat_start == self.start_char and cat_end == self.end_char: if cat_start == self.start_char and cat_end == self.end_char:
doc.cats[cat_label] = value doc.cats[cat_label] = value
if copy_user_data:
doc.user_data = self.doc.user_data
return doc return doc
def _fix_dep_copy(self, attrs, array): def _fix_dep_copy(self, attrs, array):

View File

@ -292,9 +292,10 @@ Create a new `Doc` object corresponding to the `Span`, with a copy of the data.
> assert doc2.text == u"New York" > assert doc2.text == u"New York"
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ----- | --------------------------------------- | | ----------------- | ----- | ---------------------------------------------------- |
| **RETURNS** | `Doc` | A `Doc` object of the `Span`'s content. | | `copy_user_data` | bool | Whether or not to copy the original doc's user data. |
| **RETURNS** | `Doc` | A `Doc` object of the `Span`'s content. |
## Span.root {#root tag="property" model="parser"} ## Span.root {#root tag="property" model="parser"}