Remove sentiment extension (#11722)

* remove sentiment attribute

* remove sentiment from docs

* add test for backwards compatibility

* replace from_disk with from_bytes

* Fix docs and format file

* Fix formatting
Edward 2022-11-23 13:09:32 +01:00 committed by GitHub
parent d0fc871a1c
commit e79910d57e
19 changed files with 21 additions and 87 deletions
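
For downstream code that still relies on a sentiment score, the pattern used in the updated docs below is to re-register `sentiment` as a custom extension attribute. A minimal migration sketch (assuming spaCy v4+, where the built-in attribute no longer exists):

```python
import spacy
from spacy.tokens import Doc

# Re-register "sentiment" in user space; it is then accessed via the
# `._` namespace instead of the removed built-in attribute.
Doc.set_extension("sentiment", default=0.0)

nlp = spacy.blank("en")
doc = nlp("I am happy")
doc._.sentiment += 0.1
```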

View File

@@ -20,7 +20,6 @@ class Lexeme:
     def vector_norm(self) -> float: ...
     vector: Floats1d
     rank: int
-    sentiment: float
     @property
     def orth_(self) -> str: ...
     @property

View File

@@ -173,19 +173,6 @@ cdef class Lexeme:
         def __set__(self, value):
             self.c.id = value
 
-    property sentiment:
-        """RETURNS (float): A scalar value indicating the positivity or
-        negativity of the lexeme."""
-        def __get__(self):
-            sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment", {})
-            return sentiment_table.get(self.c.orth, 0.0)
-
-        def __set__(self, float x):
-            if "lexeme_sentiment" not in self.vocab.lookups:
-                self.vocab.lookups.add_table("lexeme_sentiment")
-            sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment")
-            sentiment_table[self.c.orth] = x
-
     @property
     def orth_(self):
         """RETURNS (str): The original verbatim text of the lexeme

View File

@@ -40,7 +40,7 @@ py.test spacy/tests/tokenizer/test_exceptions.py::test_tokenizer_handles_emoji #
 To keep the behavior of the tests consistent and predictable, we try to follow a few basic conventions:
 
-- **Test names** should follow a pattern of `test_[module]_[tested behaviour]`. For example: `test_tokenizer_keeps_email` or `test_spans_override_sentiment`.
+- **Test names** should follow a pattern of `test_[module]_[tested behaviour]`. For example: `test_tokenizer_keeps_email`.
 - If you're testing for a bug reported in a specific issue, always create a **regression test**. Regression tests should be named `test_issue[ISSUE NUMBER]` and live in the [`regression`](regression) directory.
 - Only use `@pytest.mark.xfail` for tests that **should pass, but currently fail**. To test for desired negative behavior, use `assert not` in your test.
 - Very **extensive tests** that take a long time to run should be marked with `@pytest.mark.slow`. If your slow test is testing important behavior, consider adding an additional simpler version.

View File

@@ -380,9 +380,7 @@ def test_doc_api_serialize(en_tokenizer, text):
     assert [t.text for t in tokens] == [t.text for t in new_tokens]
     assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
-    new_tokens = Doc(tokens.vocab).from_bytes(
-        tokens.to_bytes(exclude=["sentiment"]), exclude=["sentiment"]
-    )
+    new_tokens = Doc(tokens.vocab).from_bytes(tokens.to_bytes())
     assert tokens.text == new_tokens.text
     assert [t.text for t in tokens] == [t.text for t in new_tokens]
     assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
@@ -990,3 +988,12 @@ def test_doc_spans_setdefault(en_tokenizer):
     assert len(doc.spans["key2"]) == 1
     doc.spans.setdefault("key3", default=SpanGroup(doc, spans=[doc[0:1], doc[1:2]]))
     assert len(doc.spans["key3"]) == 2
+
+
+def test_doc_sentiment_from_bytes_v3_to_v4():
+    """Test if a doc with sentiment attribute created in v3.x works with '.from_bytes' in v4.x without throwing errors. The sentiment attribute was removed in v4"""
+    doc_bytes = b"\x89\xa4text\xa5happy\xaaarray_head\x9fGQACKOLMN\xcd\x01\xc4\xcd\x01\xc6I\xcd\x01\xc5JP\xaaarray_body\x85\xc4\x02nd\xc3\xc4\x04type\xa3<u8\xc4\x04kind\xc4\x00\xc4\x05shape\x92\x01\x0f\xc4\x04data\xc4x\x05\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xa4\x9a\xd3\x17\xca\xf0b\x03\xa4\x9a\xd3\x17\xca\xf0b\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\xa9sentiment\xcb?\xf0\x00\x00\x00\x00\x00\x00\xa6tensor\x85\xc4\x02nd\xc3\xc4\x04type\xa3<f4\xc4\x04kind\xc4\x00\xc4\x05shape\x91\x00\xc4\x04data\xc4\x00\xa4cats\x80\xa5spans\xc4\x01\x90\xa7strings\x92\xa0\xa5happy\xb2has_unknown_spaces\xc2"
+    doc = Doc(Vocab()).from_bytes(doc_bytes)
+    assert doc.text == "happy"
+    with pytest.raises(AttributeError):
+        doc.sentiment == 1.0
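
Note: the opaque `doc_bytes` payload above is a msgpack blob carrying a `sentiment` field of 1.0 for the single-token text "happy". A hypothetical sketch of how such a payload could be produced (this requires spaCy 3.x, where `Doc.sentiment` still exists; it is not runnable on v4):

```python
# spaCy 3.x only: Doc.sentiment was a writable built-in attribute
from spacy.tokens import Doc
from spacy.vocab import Vocab

doc = Doc(Vocab(), words=["happy"])
doc.sentiment = 1.0          # serialized into the "sentiment" msg field
doc_bytes = doc.to_bytes()   # a payload like the literal in the test above
```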

View File

@@ -305,31 +305,6 @@ def test_span_similarity_match():
     assert span1[:1].similarity(doc.vocab["a"]) == 1.0
 
-def test_spans_default_sentiment(en_tokenizer):
-    """Test span.sentiment property's default averaging behaviour"""
-    text = "good stuff bad stuff"
-    tokens = en_tokenizer(text)
-    tokens.vocab[tokens[0].text].sentiment = 3.0
-    tokens.vocab[tokens[2].text].sentiment = -2.0
-    doc = Doc(tokens.vocab, words=[t.text for t in tokens])
-    assert doc[:2].sentiment == 3.0 / 2
-    assert doc[-2:].sentiment == -2.0 / 2
-    assert doc[:-1].sentiment == (3.0 + -2) / 3.0
-
-
-def test_spans_override_sentiment(en_tokenizer):
-    """Test span.sentiment property's default averaging behaviour"""
-    text = "good stuff bad stuff"
-    tokens = en_tokenizer(text)
-    tokens.vocab[tokens[0].text].sentiment = 3.0
-    tokens.vocab[tokens[2].text].sentiment = -2.0
-    doc = Doc(tokens.vocab, words=[t.text for t in tokens])
-    doc.user_span_hooks["sentiment"] = lambda span: 10.0
-    assert doc[:2].sentiment == 10.0
-    assert doc[-2:].sentiment == 10.0
-    assert doc[:-1].sentiment == 10.0
-
-
 def test_spans_are_hashable(en_tokenizer):
     """Test spans can be hashed."""
     text = "good stuff bad stuff"

View File

@@ -50,8 +50,6 @@ def test_matcher_from_usage_docs(en_vocab):
     def label_sentiment(matcher, doc, i, matches):
         match_id, start, end = matches[i]
-        if doc.vocab.strings[match_id] == "HAPPY":
-            doc.sentiment += 0.1
         span = doc[start:end]
         with doc.retokenize() as retokenizer:
             retokenizer.merge(span)
@@ -61,7 +59,6 @@ def test_matcher_from_usage_docs(en_vocab):
     matcher = Matcher(en_vocab)
     matcher.add("HAPPY", pos_patterns, on_match=label_sentiment)
     matcher(doc)
-    assert doc.sentiment != 0
     assert doc[1].norm_ == "happy emoji"

View File

@@ -48,8 +48,6 @@ cdef class Doc:
     cdef TokenC* c
 
-    cdef public float sentiment
-
     cdef public dict activations
 
     cdef public dict user_hooks

View File

@@ -21,7 +21,6 @@ class Doc:
    spans: SpanGroups
    max_length: int
    length: int
-   sentiment: float
    activations: Dict[str, Dict[str, Union[ArrayXd, Ragged]]]
    cats: Dict[str, float]
    user_hooks: Dict[str, Callable[..., Any]]

View File

@@ -243,7 +243,6 @@ cdef class Doc:
         self.c = data_start + PADDING
         self.max_length = size
         self.length = 0
-        self.sentiment = 0.0
         self.cats = {}
         self.activations = {}
         self.user_hooks = {}
@@ -1270,7 +1269,6 @@ cdef class Doc:
         other.tensor = copy.deepcopy(self.tensor)
         other.cats = copy.deepcopy(self.cats)
         other.user_data = copy.deepcopy(self.user_data)
-        other.sentiment = self.sentiment
         other.has_unknown_spaces = self.has_unknown_spaces
         other.user_hooks = dict(self.user_hooks)
         other.user_token_hooks = dict(self.user_token_hooks)
@@ -1367,7 +1365,6 @@ cdef class Doc:
             "text": lambda: self.text,
             "array_head": lambda: array_head,
             "array_body": lambda: self.to_array(array_head),
-            "sentiment": lambda: self.sentiment,
             "tensor": lambda: self.tensor,
             "cats": lambda: self.cats,
             "spans": lambda: self.spans.to_bytes(),
@@ -1405,8 +1402,6 @@ cdef class Doc:
         for key, value in zip(user_data_keys, user_data_values):
             self.user_data[key] = value
         cdef int i, start, end, has_space
-        if "sentiment" not in exclude and "sentiment" in msg:
-            self.sentiment = msg["sentiment"]
         if "tensor" not in exclude and "tensor" in msg:
             self.tensor = msg["tensor"]
         if "cats" not in exclude and "cats" in msg:

View File

@@ -82,8 +82,6 @@ class Span:
     @property
     def tensor(self) -> FloatsXd: ...
     @property
-    def sentiment(self) -> float: ...
-    @property
     def text(self) -> str: ...
     @property
     def text_with_ws(self) -> str: ...

View File

@@ -566,16 +566,6 @@ cdef class Span:
             return None
         return self.doc.tensor[self.start : self.end]
 
-    @property
-    def sentiment(self):
-        """RETURNS (float): A scalar value indicating the positivity or
-        negativity of the span.
-        """
-        if "sentiment" in self.doc.user_span_hooks:
-            return self.doc.user_span_hooks["sentiment"](self)
-        else:
-            return sum([token.sentiment for token in self]) / len(self)
-
     @property
     def text(self):
         """RETURNS (str): The original verbatim text of the span."""

View File

@@ -79,8 +79,6 @@ class Token:
     @property
     def prob(self) -> float: ...
     @property
-    def sentiment(self) -> float: ...
-    @property
     def lang(self) -> int: ...
     @property
     def idx(self) -> int: ...

View File

@@ -283,14 +283,6 @@ cdef class Token:
         """RETURNS (float): Smoothed log probability estimate of token type."""
         return self.vocab[self.c.lex.orth].prob
 
-    @property
-    def sentiment(self):
-        """RETURNS (float): A scalar value indicating the positivity or
-        negativity of the token."""
-        if "sentiment" in self.doc.user_token_hooks:
-            return self.doc.user_token_hooks["sentiment"](self)
-        return self.vocab[self.c.lex.orth].sentiment
-
     @property
     def lang(self):
         """RETURNS (uint64): ID of the language of the parent document's

View File

@@ -761,7 +761,6 @@ The L2 norm of the document's vector representation.
 | `user_data` | A generic storage area, for user custom data. ~~Dict[str, Any]~~ |
 | `lang` <Tag variant="new">2.1</Tag> | Language of the document's vocabulary. ~~int~~ |
 | `lang_` <Tag variant="new">2.1</Tag> | Language of the document's vocabulary. ~~str~~ |
-| `sentiment` | The document's positivity/negativity score, if available. ~~float~~ |
 | `user_hooks` | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~ |
 | `user_token_hooks` | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~ |
 | `user_span_hooks` | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~ |
@@ -785,7 +784,6 @@ serialization by passing in the string names via the `exclude` argument.
 | Name | Description |
 | ------------------ | --------------------------------------------- |
 | `text` | The value of the `Doc.text` attribute. |
-| `sentiment` | The value of the `Doc.sentiment` attribute. |
 | `tensor` | The value of the `Doc.tensor` attribute. |
 | `user_data` | The value of the `Doc.user_data` dictionary. |
 | `user_data_keys` | The keys of the `Doc.user_data` dictionary. |

View File

@@ -161,4 +161,3 @@ The L2 norm of the lexeme's vector representation.
 | `lang_` | Language of the parent vocabulary. ~~str~~ |
 | `prob` | Smoothed log probability estimate of the lexeme's word type (context-independent entry in the vocabulary). ~~float~~ |
 | `cluster` | Brown cluster ID. ~~int~~ |
-| `sentiment` | A scalar value indicating the positivity or negativity of the lexeme. ~~float~~ |

View File

@@ -565,5 +565,4 @@ overlaps with will be returned.
 | `ent_id_` | Alias for `id_`: the span's ID. ~~str~~ |
 | `id` | The hash value of the span's ID. ~~int~~ |
 | `id_` | The span's ID. ~~str~~ |
-| `sentiment` | A scalar value indicating the positivity or negativity of the span. ~~float~~ |
 | `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ |

View File

@@ -470,7 +470,6 @@ The L2 norm of the token's vector representation.
 | `lang_` | Language of the parent document's vocabulary. ~~str~~ |
 | `prob` | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). ~~float~~ |
 | `idx` | The character offset of the token within the parent document. ~~int~~ |
-| `sentiment` | A scalar value indicating the positivity or negativity of the token. ~~float~~ |
 | `lex_id` | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~ |
 | `rank` | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~ |
 | `cluster` | Brown cluster ID. ~~int~~ |

View File

@@ -1400,7 +1400,7 @@ separation and makes it easier to ensure backwards compatibility. For example,
 if you've implemented your own `.coref` property and spaCy claims it one day,
 it'll break your code. Similarly, just by looking at the code, you'll
 immediately know what's built-in and what's custom – for example,
-`doc.sentiment` is spaCy, while `doc._.sent_score` isn't.
+`doc.lang` is spaCy, while `doc._.language` isn't.
 
 </Accordion>

View File

@@ -776,6 +776,9 @@ whitespace, making them easy to match as well.
 ```python {executable="true"}
 from spacy.lang.en import English
 from spacy.matcher import Matcher
+from spacy.tokens import Doc
+
+Doc.set_extension("sentiment", default=0.0)
 
 nlp = English()  # We only want the tokenizer, so no need to load a pipeline
 matcher = Matcher(nlp.vocab)
@@ -791,9 +794,9 @@ neg_patterns = [[{"ORTH": emoji}] for emoji in neg_emoji]
 
 def label_sentiment(matcher, doc, i, matches):
     match_id, start, end = matches[i]
     if doc.vocab.strings[match_id] == "HAPPY":  # Don't forget to get string!
-        doc.sentiment += 0.1  # Add 0.1 for positive sentiment
+        doc._.sentiment += 0.1  # Add 0.1 for positive sentiment
     elif doc.vocab.strings[match_id] == "SAD":
-        doc.sentiment -= 0.1  # Subtract 0.1 for negative sentiment
+        doc._.sentiment -= 0.1  # Subtract 0.1 for negative sentiment
 
 matcher.add("HAPPY", pos_patterns, on_match=label_sentiment)  # Add positive pattern
 matcher.add("SAD", neg_patterns, on_match=label_sentiment)  # Add negative pattern
@@ -823,16 +826,17 @@ the emoji span will make it available as `span._.emoji_desc`.
 
 ```python
 from emojipedia import Emojipedia  # Installation: pip install emojipedia
-from spacy.tokens import Span  # Get the global Span object
+from spacy.tokens import Doc, Span  # Get the global Doc and Span object
 
 Span.set_extension("emoji_desc", default=None)  # Register the custom attribute
+Doc.set_extension("sentiment", default=0.0)
 
 def label_sentiment(matcher, doc, i, matches):
     match_id, start, end = matches[i]
     if doc.vocab.strings[match_id] == "HAPPY":  # Don't forget to get string!
-        doc.sentiment += 0.1  # Add 0.1 for positive sentiment
+        doc._.sentiment += 0.1  # Add 0.1 for positive sentiment
     elif doc.vocab.strings[match_id] == "SAD":
-        doc.sentiment -= 0.1  # Subtract 0.1 for negative sentiment
+        doc._.sentiment -= 0.1  # Subtract 0.1 for negative sentiment
     span = doc[start:end]
     emoji = Emojipedia.search(span[0].text)  # Get data for emoji
     span._.emoji_desc = emoji.title  # Assign emoji description