Token sent attributes more consistent (#10164)

* remove duplicate line

* add sent start/end token attributes to the docs

* let has_annotation work with IS_SENT_END

* elif instead of if

* add has_annotation test for sent attributes

* fix typo

* remove duplicate is_sent_start entry in docs
This commit is contained in:
Sofie Van Landeghem 2022-02-08 08:35:37 +01:00 committed by GitHub
parent 836f689cc7
commit deb143fa70
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 27 additions and 21 deletions

View File

@ -310,7 +310,6 @@ GLOSSARY = {
"re": "repeated element",
"rs": "reported speech",
"sb": "subject",
"sb": "subject",
"sbp": "passivized subject (PP)",
"sp": "subject or predicate",
"svp": "separable verb prefix",

View File

@ -684,6 +684,7 @@ def test_has_annotation(en_vocab):
attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "HEAD", "ENT_IOB", "ENT_TYPE")
for attr in attrs:
assert not doc.has_annotation(attr)
assert not doc.has_annotation(attr, require_complete=True)
doc[0].tag_ = "A"
doc[0].pos_ = "X"
@ -709,6 +710,27 @@ def test_has_annotation(en_vocab):
assert doc.has_annotation(attr, require_complete=True)
def test_has_annotation_sents(en_vocab):
    """Check Doc.has_annotation for the sentence-boundary attributes.

    The three attribute names are checked side by side at every step:
    before any boundary is set, after one token is annotated, and after
    the remaining token is annotated.
    """
    sent_attrs = ("SENT_START", "IS_SENT_START", "IS_SENT_END")
    doc = Doc(en_vocab, words=["Hello", "beautiful", "world"])

    def check_all(expect_partial, expect_complete):
        # Assert the partial and the complete result for every attribute name.
        for name in sent_attrs:
            assert bool(doc.has_annotation(name)) == expect_partial
            assert (
                bool(doc.has_annotation(name, require_complete=True))
                == expect_complete
            )

    # Fresh doc: no sentence boundaries have been set yet.
    check_all(False, False)

    # The first token (index 0) is always assumed to be a sentence start,
    # and ignored by the check in doc.has_annotation
    doc[1].is_sent_start = False
    check_all(True, False)

    # With the last token set as well, the annotation counts as complete.
    doc[2].is_sent_start = False
    check_all(True, True)
def test_is_flags_deprecated(en_tokenizer):
doc = en_tokenizer("test")
with pytest.deprecated_call():

View File

@ -420,6 +420,8 @@ cdef class Doc:
cdef int range_start = 0
if attr == "IS_SENT_START" or attr == self.vocab.strings["IS_SENT_START"]:
attr = SENT_START
elif attr == "IS_SENT_END" or attr == self.vocab.strings["IS_SENT_END"]:
attr = SENT_START
attr = intify_attr(attr)
# adjust attributes
if attr == HEAD:

View File

@ -487,8 +487,6 @@ cdef class Token:
RETURNS (bool / None): Whether the token starts a sentence.
None if unknown.
DOCS: https://spacy.io/api/token#is_sent_start
"""
def __get__(self):
if self.c.sent_start == 0:

View File

@ -304,7 +304,7 @@ ancestor is found, e.g. if span excludes a necessary ancestor.
## Doc.has_annotation {#has_annotation tag="method"}
Check whether the doc contains annotation on a token attribute.
Check whether the doc contains annotation on a [`Token` attribute](/api/token#attributes).
<Infobox title="Changed in v3.0" variant="warning">

View File

@ -349,23 +349,6 @@ A sequence containing the token and all the token's syntactic descendants.
| ---------- | ------------------------------------------------------------------------------------ |
| **YIELDS** | A descendant token such that `self.is_ancestor(token)` or `token == self`. ~~Token~~ |
## Token.is_sent_start {#is_sent_start tag="property" new="2"}
A boolean value indicating whether the token starts a sentence. `None` if
unknown. Defaults to `True` for the first token in the `Doc`.
> #### Example
>
> ```python
> doc = nlp("Give it back! He pleaded.")
> assert doc[4].is_sent_start
> assert not doc[5].is_sent_start
> ```
| Name | Description |
| ----------- | ------------------------------------------------------- |
| **RETURNS** | Whether the token starts a sentence. ~~Optional[bool]~~ |
## Token.has_vector {#has_vector tag="property" model="vectors"}
A boolean value indicating whether a word vector is associated with the token.
@ -465,6 +448,8 @@ The L2 norm of the token's vector representation.
| `is_punct` | Is the token punctuation? ~~bool~~ |
| `is_left_punct` | Is the token a left punctuation mark, e.g. `"("` ? ~~bool~~ |
| `is_right_punct` | Is the token a right punctuation mark, e.g. `")"` ? ~~bool~~ |
| `is_sent_start` | Does the token start a sentence? `None` if unknown. Defaults to `True` for the first token in the `Doc`. ~~Optional[bool]~~ |
| `is_sent_end` | Does the token end a sentence? `None` if unknown. ~~Optional[bool]~~ |
| `is_space` | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. ~~bool~~ |
| `is_bracket` | Is the token a bracket? ~~bool~~ |
| `is_quote` | Is the token a quotation mark? ~~bool~~ |