additional information if doc is empty

This commit is contained in:
svlandeg 2020-03-09 18:08:18 +01:00
parent 1d6aec805d
commit 1724a4f75b
3 changed files with 13 additions and 6 deletions

View File

@@ -5,7 +5,7 @@ import pytest
import re import re
from mock import Mock from mock import Mock
from spacy.matcher import Matcher, DependencyMatcher from spacy.matcher import Matcher, DependencyMatcher
from spacy.tokens import Doc, Token from spacy.tokens import Doc, Token, Span
from ..doc.test_underscore import clean_underscore from ..doc.test_underscore import clean_underscore
@@ -458,3 +458,10 @@ def test_matcher_callback(en_vocab):
doc = Doc(en_vocab, words=["This", "is", "a", "test", "."]) doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
matches = matcher(doc) matches = matcher(doc)
mock.assert_called_once_with(matcher, doc, 0, matches) mock.assert_called_once_with(matcher, doc, 0, matches)
def test_matcher_span(matcher):
    """Run the matcher on a Doc derived from a Span and expect one match.

    A Doc is built from the whitespace-split sentence using the matcher's
    vocab, the first three tokens are taken as a Span, and that Span is
    converted back into a standalone Doc before matching.
    """
    words = "JavaScript is good but Java is better".split()
    full_doc = Doc(matcher.vocab, words=words)
    # Keep only tokens 0..2; the rest of the sentence is left out of the span.
    leading_span = Span(full_doc, 0, 3)
    found = matcher(leading_span.as_doc())
    assert len(found) == 1

View File

@@ -260,7 +260,7 @@ cdef class Doc:
def is_nered(self): def is_nered(self):
"""Check if the document has named entities set. Will return True if """Check if the document has named entities set. Will return True if
*any* of the tokens has a named entity tag set (even if the others are *any* of the tokens has a named entity tag set (even if the others are
unknown values). unknown values), or if the document is empty.
""" """
if len(self) == 0: if len(self) == 0:
return True return True

View File

@@ -657,10 +657,10 @@ The L2 norm of the document's vector representation.
| `user_data` | - | A generic storage area, for user custom data. | | `user_data` | - | A generic storage area, for user custom data. |
| `lang` <Tag variant="new">2.1</Tag> | int | Language of the document's vocabulary. | | `lang` <Tag variant="new">2.1</Tag> | int | Language of the document's vocabulary. |
| `lang_` <Tag variant="new">2.1</Tag> | unicode | Language of the document's vocabulary. | | `lang_` <Tag variant="new">2.1</Tag> | unicode | Language of the document's vocabulary. |
| `is_tagged` | bool | A flag indicating that the document has been part-of-speech tagged. | | `is_tagged` | bool | A flag indicating that the document has been part-of-speech tagged. Returns `True` if the `Doc` is empty. |
| `is_parsed` | bool | A flag indicating that the document has been syntactically parsed. | | `is_parsed` | bool | A flag indicating that the document has been syntactically parsed. Returns `True` if the `Doc` is empty. |
| `is_sentenced` | bool | A flag indicating that sentence boundaries have been applied to the document. | | `is_sentenced` | bool | A flag indicating that sentence boundaries have been applied to the document. Returns `True` if the `Doc` is empty. |
| `is_nered` <Tag variant="new">2.1</Tag> | bool | A flag indicating that named entities have been set. Will return `True` if _any_ of the tokens has an entity tag set, even if the others are unknown. | | `is_nered` <Tag variant="new">2.1</Tag> | bool | A flag indicating that named entities have been set. Will return `True` if the `Doc` is empty, or if _any_ of the tokens has an entity tag set, even if the others are unknown. |
| `sentiment` | float | The document's positivity/negativity score, if available. | | `sentiment` | float | The document's positivity/negativity score, if available. |
| `user_hooks` | dict | A dictionary that allows customization of the `Doc`'s properties. | | `user_hooks` | dict | A dictionary that allows customization of the `Doc`'s properties. |
| `user_token_hooks` | dict | A dictionary that allows customization of properties of `Token` children. | | `user_token_hooks` | dict | A dictionary that allows customization of properties of `Token` children. |