mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-13 05:07:03 +03:00
Token sent attributes more consistent (#10164)
* remove duplicate line
* add sent start/end token attributes to the docs
* let has_annotation work with IS_SENT_END
* elif instead of if
* add has_annotation test for sent attributes
* fix typo
* remove duplicate is_sent_start entry in docs
This commit is contained in:
parent
836f689cc7
commit
deb143fa70
|
@ -310,7 +310,6 @@ GLOSSARY = {
|
||||||
"re": "repeated element",
|
"re": "repeated element",
|
||||||
"rs": "reported speech",
|
"rs": "reported speech",
|
||||||
"sb": "subject",
|
"sb": "subject",
|
||||||
"sb": "subject",
|
|
||||||
"sbp": "passivized subject (PP)",
|
"sbp": "passivized subject (PP)",
|
||||||
"sp": "subject or predicate",
|
"sp": "subject or predicate",
|
||||||
"svp": "separable verb prefix",
|
"svp": "separable verb prefix",
|
||||||
|
|
|
@ -684,6 +684,7 @@ def test_has_annotation(en_vocab):
|
||||||
attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "HEAD", "ENT_IOB", "ENT_TYPE")
|
attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "HEAD", "ENT_IOB", "ENT_TYPE")
|
||||||
for attr in attrs:
|
for attr in attrs:
|
||||||
assert not doc.has_annotation(attr)
|
assert not doc.has_annotation(attr)
|
||||||
|
assert not doc.has_annotation(attr, require_complete=True)
|
||||||
|
|
||||||
doc[0].tag_ = "A"
|
doc[0].tag_ = "A"
|
||||||
doc[0].pos_ = "X"
|
doc[0].pos_ = "X"
|
||||||
|
@ -709,6 +710,27 @@ def test_has_annotation(en_vocab):
|
||||||
assert doc.has_annotation(attr, require_complete=True)
|
assert doc.has_annotation(attr, require_complete=True)
|
||||||
|
|
||||||
|
|
||||||
|
def test_has_annotation_sents(en_vocab):
    """Exercise Doc.has_annotation for the sentence-boundary attributes.

    The same expectations are checked for "SENT_START", "IS_SENT_START" and
    "IS_SENT_END", first on a fresh three-token doc and then after
    ``is_sent_start`` is assigned on token 1 and on token 2 in turn.
    """
    doc = Doc(en_vocab, words=["Hello", "beautiful", "world"])
    sent_attrs = ("SENT_START", "IS_SENT_START", "IS_SENT_END")

    def check(partial, complete):
        # For every attribute name, verify both the default check and the
        # stricter require_complete=True check, in that order.
        for name in sent_attrs:
            assert bool(doc.has_annotation(name)) == partial
            assert bool(doc.has_annotation(name, require_complete=True)) == complete

    # Nothing has been assigned yet: neither check succeeds.
    check(partial=False, complete=False)

    # The first token (index 0) is always assumed to be a sentence start,
    # and ignored by the check in doc.has_annotation

    # One of the two remaining tokens is set: annotation is present but
    # not complete.
    doc[1].is_sent_start = False
    check(partial=True, complete=False)

    # Both remaining tokens are set: annotation is now complete.
    doc[2].is_sent_start = False
    check(partial=True, complete=True)
|
||||||
|
|
||||||
|
|
||||||
def test_is_flags_deprecated(en_tokenizer):
|
def test_is_flags_deprecated(en_tokenizer):
|
||||||
doc = en_tokenizer("test")
|
doc = en_tokenizer("test")
|
||||||
with pytest.deprecated_call():
|
with pytest.deprecated_call():
|
||||||
|
|
|
@ -420,6 +420,8 @@ cdef class Doc:
|
||||||
cdef int range_start = 0
|
cdef int range_start = 0
|
||||||
if attr == "IS_SENT_START" or attr == self.vocab.strings["IS_SENT_START"]:
|
if attr == "IS_SENT_START" or attr == self.vocab.strings["IS_SENT_START"]:
|
||||||
attr = SENT_START
|
attr = SENT_START
|
||||||
|
elif attr == "IS_SENT_END" or attr == self.vocab.strings["IS_SENT_END"]:
|
||||||
|
attr = SENT_START
|
||||||
attr = intify_attr(attr)
|
attr = intify_attr(attr)
|
||||||
# adjust attributes
|
# adjust attributes
|
||||||
if attr == HEAD:
|
if attr == HEAD:
|
||||||
|
|
|
@ -487,8 +487,6 @@ cdef class Token:
|
||||||
|
|
||||||
RETURNS (bool / None): Whether the token starts a sentence.
|
RETURNS (bool / None): Whether the token starts a sentence.
|
||||||
None if unknown.
|
None if unknown.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/token#is_sent_start
|
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
if self.c.sent_start == 0:
|
if self.c.sent_start == 0:
|
||||||
|
|
|
@ -304,7 +304,7 @@ ancestor is found, e.g. if span excludes a necessary ancestor.
|
||||||
|
|
||||||
## Doc.has_annotation {#has_annotation tag="method"}
|
## Doc.has_annotation {#has_annotation tag="method"}
|
||||||
|
|
||||||
Check whether the doc contains annotation on a token attribute.
|
Check whether the doc contains annotation on a [`Token` attribute](/api/token#attributes).
|
||||||
|
|
||||||
<Infobox title="Changed in v3.0" variant="warning">
|
<Infobox title="Changed in v3.0" variant="warning">
|
||||||
|
|
||||||
|
|
|
@ -349,23 +349,6 @@ A sequence containing the token and all the token's syntactic descendants.
|
||||||
| ---------- | ------------------------------------------------------------------------------------ |
|
| ---------- | ------------------------------------------------------------------------------------ |
|
||||||
| **YIELDS** | A descendant token such that `self.is_ancestor(token)` or `token == self`. ~~Token~~ |
|
| **YIELDS** | A descendant token such that `self.is_ancestor(token)` or `token == self`. ~~Token~~ |
|
||||||
|
|
||||||
## Token.is_sent_start {#is_sent_start tag="property" new="2"}
|
|
||||||
|
|
||||||
A boolean value indicating whether the token starts a sentence. `None` if
|
|
||||||
unknown. Defaults to `True` for the first token in the `Doc`.
|
|
||||||
|
|
||||||
> #### Example
|
|
||||||
>
|
|
||||||
> ```python
|
|
||||||
> doc = nlp("Give it back! He pleaded.")
|
|
||||||
> assert doc[4].is_sent_start
|
|
||||||
> assert not doc[5].is_sent_start
|
|
||||||
> ```
|
|
||||||
|
|
||||||
| Name | Description |
|
|
||||||
| ----------- | ------------------------------------------------------- |
|
|
||||||
| **RETURNS** | Whether the token starts a sentence. ~~Optional[bool]~~ |
|
|
||||||
|
|
||||||
## Token.has_vector {#has_vector tag="property" model="vectors"}
|
## Token.has_vector {#has_vector tag="property" model="vectors"}
|
||||||
|
|
||||||
A boolean value indicating whether a word vector is associated with the token.
|
A boolean value indicating whether a word vector is associated with the token.
|
||||||
|
@ -465,6 +448,8 @@ The L2 norm of the token's vector representation.
|
||||||
| `is_punct` | Is the token punctuation? ~~bool~~ |
|
| `is_punct` | Is the token punctuation? ~~bool~~ |
|
||||||
| `is_left_punct` | Is the token a left punctuation mark, e.g. `"("` ? ~~bool~~ |
|
| `is_left_punct` | Is the token a left punctuation mark, e.g. `"("` ? ~~bool~~ |
|
||||||
| `is_right_punct` | Is the token a right punctuation mark, e.g. `")"` ? ~~bool~~ |
|
| `is_right_punct` | Is the token a right punctuation mark, e.g. `")"` ? ~~bool~~ |
|
||||||
|
| `is_sent_start` | Does the token start a sentence? ~~bool~~ or `None` if unknown. Defaults to `True` for the first token in the `Doc`. |
|
||||||
|
| `is_sent_end` | Does the token end a sentence? ~~bool~~ or `None` if unknown. |
|
||||||
| `is_space` | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. ~~bool~~ |
|
| `is_space` | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. ~~bool~~ |
|
||||||
| `is_bracket` | Is the token a bracket? ~~bool~~ |
|
| `is_bracket` | Is the token a bracket? ~~bool~~ |
|
||||||
| `is_quote` | Is the token a quotation mark? ~~bool~~ |
|
| `is_quote` | Is the token a quotation mark? ~~bool~~ |
|
||||||
|
|
Loading…
Reference in New Issue
Block a user