Return pipeline labels in sorted order and add `copy_user_data` to `Span.as_doc`.

* DependencyParser.labels / EntityRecognizer.labels: return the label set as a
  sorted tuple; for parser labels containing "||", keep only the part after
  the delimiter.
* Span.as_doc: new `copy_user_data` flag (default False). When set, the new
  Doc receives a shallow copy of the parent doc's `user_data`, so later
  writes to the copy do not leak back into the original document.
* Add a test and an API docs entry for the new flag.

diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 74d56e8d7..f9263176b 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -1066,8 +1066,15 @@ cdef class DependencyParser(Parser):
 
     @property
     def labels(self):
+        labels = set()
         # Get the labels from the model by looking at the available moves
-        return tuple(set(move.split("-")[1] for move in self.move_names if "-" in move))
+        for move in self.move_names:
+            if "-" in move:
+                label = move.split("-")[1]
+                if "||" in label:
+                    label = label.split("||")[1]
+                labels.add(label)
+        return tuple(sorted(labels))
 
 
 cdef class EntityRecognizer(Parser):
@@ -1102,8 +1109,9 @@ cdef class EntityRecognizer(Parser):
     def labels(self):
         # Get the labels from the model by looking at the available moves, e.g.
         # B-PERSON, I-PERSON, L-PERSON, U-PERSON
-        return tuple(set(move.split("-")[1] for move in self.move_names
-                         if move[0] in ("B", "I", "L", "U")))
+        labels = set(move.split("-")[1] for move in self.move_names
+                     if move[0] in ("B", "I", "L", "U"))
+        return tuple(sorted(labels))
 
 
 class EntityLinker(Pipe):
diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index 60b711741..c8c809d24 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -173,6 +173,21 @@ def test_span_as_doc(doc):
     assert span_doc[0].idx == 0
 
 
+def test_span_as_doc_user_data(doc):
+    """Test that the user_data can be preserved (but not by default). """
+    my_key = "my_info"
+    my_value = 342
+    doc.user_data[my_key] = my_value
+
+    span = doc[4:10]
+    span_doc_with = span.as_doc(copy_user_data=True)
+    span_doc_without = span.as_doc()
+
+    assert doc.user_data.get(my_key, None) is my_value
+    assert span_doc_with.user_data.get(my_key, None) is my_value
+    assert span_doc_without.user_data.get(my_key, None) is None
+
+
 def test_span_string_label_kb_id(doc):
     span = Span(doc, 0, 1, label="hello", kb_id="Q342")
     assert span.label_ == "hello"
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index f702133af..9e99392a9 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -200,13 +200,15 @@ cdef class Span:
         return Underscore(Underscore.span_extensions, self,
                           start=self.start_char, end=self.end_char)
 
-    def as_doc(self):
+    def as_doc(self, bint copy_user_data=False):
         """Create a `Doc` object with a copy of the `Span`'s data.
 
+        copy_user_data (bool): Whether or not to copy the original doc's user data.
         RETURNS (Doc): The `Doc` copy of the span.
 
         DOCS: https://spacy.io/api/span#as_doc
         """
+        # TODO: make copy_user_data a keyword-only argument (Python 3 only)
         words = [t.text for t in self]
         spaces = [bool(t.whitespace_) for t in self]
         cdef Doc doc = Doc(self.doc.vocab, words=words, spaces=spaces)
@@ -235,6 +237,9 @@ cdef class Span:
                 cat_start, cat_end, cat_label = key
                 if cat_start == self.start_char and cat_end == self.end_char:
                     doc.cats[cat_label] = value
+        if copy_user_data:
+            # Shallow copy: writes to the new doc's user_data must not leak back.
+            doc.user_data = dict(self.doc.user_data)
         return doc
 
     def _fix_dep_copy(self, attrs, array):
diff --git a/website/docs/api/span.md b/website/docs/api/span.md
index 7e3ce19d0..64b77b89d 100644
--- a/website/docs/api/span.md
+++ b/website/docs/api/span.md
@@ -293,9 +293,10 @@ Create a new `Doc` object corresponding to the `Span`, with a copy of the data.
 > assert doc2.text == "New York"
 > ```
 
-| Name        | Type  | Description                             |
-| ----------- | ----- | --------------------------------------- |
-| **RETURNS** | `Doc` | A `Doc` object of the `Span`'s content. |
+| Name              | Type  | Description                                          |
+| ----------------- | ----- | ---------------------------------------------------- |
+| `copy_user_data`  | bool  | Whether or not to copy the original doc's user data. |
+| **RETURNS**       | `Doc` | A `Doc` object of the `Span`'s content.              |
 
 ## Span.root {#root tag="property" model="parser"}
 