mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 00:46:28 +03:00
💫 Improve Doc.to_json and add Doc.is_nered (#3381)
* Use default return instead of else
* Add Doc.is_nered to indicate if entities have been set
* Add properties in Doc.to_json if they were set, not if they're available

This way, if a processed Doc exports "pos": None, it means that the tag was explicitly unset. If it exports "ents": [], it means that entity annotations are available but that this document doesn't contain any entities. Before, this would have been unclear and problematic for training.
This commit is contained in:
parent
7984543953
commit
0426689db8
|
@ -4,9 +4,10 @@ from __future__ import unicode_literals
|
|||
|
||||
import pytest
|
||||
import numpy
|
||||
from spacy.tokens import Doc
|
||||
from spacy.tokens import Doc, Span
|
||||
from spacy.vocab import Vocab
|
||||
from spacy.errors import ModelsWarning
|
||||
from spacy.attrs import ENT_TYPE, ENT_IOB
|
||||
|
||||
from ..util import get_doc
|
||||
|
||||
|
@ -256,3 +257,18 @@ def test_lowest_common_ancestor(en_tokenizer, sentence, heads, lca_matrix):
|
|||
assert lca[1, 1] == 1
|
||||
assert lca[0, 1] == 2
|
||||
assert lca[1, 2] == 2
|
||||
|
||||
|
||||
def test_doc_is_nered(en_vocab):
    """Check that Doc.is_nered flips to True once entity annotations exist."""
    tokens = ["I", "live", "in", "New", "York"]

    # A freshly built Doc has no entity annotations yet.
    fresh = Doc(en_vocab, words=tokens)
    assert not fresh.is_nered

    # Assigning a span to doc.ents marks the document as entity-annotated.
    fresh.ents = [Span(fresh, 3, 5, label="GPE")]
    assert fresh.is_nered

    # Build a doc from an attribute array in which the first three tokens
    # keep zeroed (unknown) entity values and only the last two are set.
    ent_rows = [[0, 0], [0, 0], [0, 0], [384, 3], [384, 1]]
    ent_array = numpy.array(ent_rows, dtype="uint64")
    from_arr = Doc(en_vocab, words=tokens).from_array([ENT_TYPE, ENT_IOB], ent_array)
    assert from_arr.is_nered

    # The flag has to survive a to_bytes / from_bytes round trip.
    restored = Doc(en_vocab).from_bytes(from_arr.to_bytes())
    assert restored.is_nered
|
||||
|
|
|
@ -240,8 +240,18 @@ cdef class Doc:
|
|||
for i in range(1, self.length):
|
||||
if self.c[i].sent_start == -1 or self.c[i].sent_start == 1:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
return False
|
||||
|
||||
@property
def is_nered(self):
    """Check whether named entity annotations have been set on the document.

    A single token with an entity IOB code other than 0 is enough: the
    remaining tokens may still hold unknown values.

    RETURNS (bool): True if at least one token has a non-zero `ent_iob`,
        False otherwise (including for a document without tokens).
    """
    # 0 is the "nothing set" IOB code, so any other value means the token
    # carries an entity annotation.
    return any(self.c[idx].ent_iob != 0 for idx in range(self.length))
|
||||
|
||||
def __getitem__(self, object i):
|
||||
"""Get a `Token` or `Span` object.
|
||||
|
@ -990,11 +1000,11 @@ cdef class Doc:
|
|||
DOCS: https://spacy.io/api/doc#to_json
|
||||
"""
|
||||
data = {"text": self.text}
|
||||
if self.ents:
|
||||
if self.is_nered:
|
||||
data["ents"] = [{"start": ent.start_char, "end": ent.end_char,
|
||||
"label": ent.label_} for ent in self.ents]
|
||||
sents = list(self.sents)
|
||||
if sents:
|
||||
if self.is_sentenced:
|
||||
sents = list(self.sents)
|
||||
data["sents"] = [{"start": sent.start_char, "end": sent.end_char}
|
||||
for sent in sents]
|
||||
if self.cats:
|
||||
|
@ -1002,13 +1012,11 @@ cdef class Doc:
|
|||
data["tokens"] = []
|
||||
for token in self:
|
||||
token_data = {"id": token.i, "start": token.idx, "end": token.idx + len(token)}
|
||||
if token.pos_:
|
||||
if self.is_tagged:
|
||||
token_data["pos"] = token.pos_
|
||||
if token.tag_:
|
||||
token_data["tag"] = token.tag_
|
||||
if token.dep_:
|
||||
if self.is_parsed:
|
||||
token_data["dep"] = token.dep_
|
||||
if token.head:
|
||||
token_data["head"] = token.head.i
|
||||
data["tokens"].append(token_data)
|
||||
if underscore:
|
||||
|
|
|
@ -237,7 +237,7 @@ attribute ID.
|
|||
> from spacy.attrs import ORTH
|
||||
> doc = nlp(u"apple apple orange banana")
|
||||
> assert doc.count_by(ORTH) == {7024L: 1, 119552L: 1, 2087L: 2}
|
||||
> doc.to_array([attrs.ORTH])
|
||||
> doc.to_array([ORTH])
|
||||
> # array([[11880], [11880], [7561], [12800]])
|
||||
> ```
|
||||
|
||||
|
@ -640,20 +640,21 @@ The L2 norm of the document's vector representation.
|
|||
|
||||
## Attributes {#attributes}
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `text` | unicode | A unicode representation of the document text. |
|
||||
| `text_with_ws` | unicode | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. |
|
||||
| `mem` | `Pool` | The document's local memory heap, for all C data it owns. |
|
||||
| `vocab` | `Vocab` | The store of lexical types. |
|
||||
| `tensor` <Tag variant="new">2</Tag> | object | Container for dense vector representations. |
|
||||
| `cats` <Tag variant="new">2</Tag> | dictionary | Maps either a label to a score for categories applied to whole document, or `(start_char, end_char, label)` to score for categories applied to spans. `start_char` and `end_char` should be character offsets, label can be either a string or an integer ID, and score should be a float. |
|
||||
| `user_data` | - | A generic storage area, for user custom data. |
|
||||
| `is_tagged` | bool | A flag indicating that the document has been part-of-speech tagged. |
|
||||
| `is_parsed` | bool | A flag indicating that the document has been syntactically parsed. |
|
||||
| `is_sentenced` | bool | A flag indicating that sentence boundaries have been applied to the document. |
|
||||
| `sentiment` | float | The document's positivity/negativity score, if available. |
|
||||
| `user_hooks` | dict | A dictionary that allows customization of the `Doc`'s properties. |
|
||||
| `user_token_hooks` | dict | A dictionary that allows customization of properties of `Token` children. |
|
||||
| `user_span_hooks` | dict | A dictionary that allows customization of properties of `Span` children. |
|
||||
| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). |
|
||||
| Name | Type | Description |
|
||||
| --------------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `text` | unicode | A unicode representation of the document text. |
|
||||
| `text_with_ws` | unicode | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. |
|
||||
| `mem` | `Pool` | The document's local memory heap, for all C data it owns. |
|
||||
| `vocab` | `Vocab` | The store of lexical types. |
|
||||
| `tensor` <Tag variant="new">2</Tag> | object | Container for dense vector representations. |
|
||||
| `cats` <Tag variant="new">2</Tag> | dictionary | Maps either a label to a score for categories applied to whole document, or `(start_char, end_char, label)` to score for categories applied to spans. `start_char` and `end_char` should be character offsets, label can be either a string or an integer ID, and score should be a float. |
|
||||
| `user_data` | - | A generic storage area, for user custom data. |
|
||||
| `is_tagged` | bool | A flag indicating that the document has been part-of-speech tagged. |
|
||||
| `is_parsed` | bool | A flag indicating that the document has been syntactically parsed. |
|
||||
| `is_sentenced` | bool | A flag indicating that sentence boundaries have been applied to the document. |
|
||||
| `is_nered` <Tag variant="new">2.1</Tag> | bool | A flag indicating that named entities have been set. Will return `True` if _any_ of the tokens has an entity tag set, even if the others are unknown. |
|
||||
| `sentiment` | float | The document's positivity/negativity score, if available. |
|
||||
| `user_hooks` | dict | A dictionary that allows customization of the `Doc`'s properties. |
|
||||
| `user_token_hooks` | dict | A dictionary that allows customization of properties of `Token` children. |
|
||||
| `user_span_hooks` | dict | A dictionary that allows customization of properties of `Span` children. |
|
||||
| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). |
|
||||
|
|
Loading…
Reference in New Issue
Block a user