From e79910d57ec4f2f5d02e0c8d137a7ae8cd608135 Mon Sep 17 00:00:00 2001 From: Edward <43848523+thomashacker@users.noreply.github.com> Date: Wed, 23 Nov 2022 13:09:32 +0100 Subject: [PATCH] Remove sentiment extension (#11722) * remove sentiment attribute * remove sentiment from docs * add test for backwards compatibility * replace from_disk with from_bytes * Fix docs and format file * Fix formatting --- spacy/lexeme.pyi | 1 - spacy/lexeme.pyx | 13 ----------- spacy/tests/README.md | 2 +- spacy/tests/doc/test_doc_api.py | 13 ++++++++--- spacy/tests/doc/test_span.py | 25 ---------------------- spacy/tests/matcher/test_matcher_api.py | 3 --- spacy/tokens/doc.pxd | 2 -- spacy/tokens/doc.pyi | 1 - spacy/tokens/doc.pyx | 5 ----- spacy/tokens/span.pyi | 2 -- spacy/tokens/span.pyx | 10 --------- spacy/tokens/token.pyi | 2 -- spacy/tokens/token.pyx | 8 ------- website/docs/api/doc.md | 2 -- website/docs/api/lexeme.md | 1 - website/docs/api/span.md | 1 - website/docs/api/token.md | 1 - website/docs/usage/processing-pipelines.md | 2 +- website/docs/usage/rule-based-matching.md | 14 +++++++----- 19 files changed, 21 insertions(+), 87 deletions(-) diff --git a/spacy/lexeme.pyi b/spacy/lexeme.pyi index 4fcaa82cf..4942b18aa 100644 --- a/spacy/lexeme.pyi +++ b/spacy/lexeme.pyi @@ -20,7 +20,6 @@ class Lexeme: def vector_norm(self) -> float: ... vector: Floats1d rank: int - sentiment: float @property def orth_(self) -> str: ... 
@property diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 6c66effde..73bf28dc2 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -173,19 +173,6 @@ cdef class Lexeme: def __set__(self, value): self.c.id = value - property sentiment: - """RETURNS (float): A scalar value indicating the positivity or - negativity of the lexeme.""" - def __get__(self): - sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment", {}) - return sentiment_table.get(self.c.orth, 0.0) - - def __set__(self, float x): - if "lexeme_sentiment" not in self.vocab.lookups: - self.vocab.lookups.add_table("lexeme_sentiment") - sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment") - sentiment_table[self.c.orth] = x - @property def orth_(self): """RETURNS (str): The original verbatim text of the lexeme diff --git a/spacy/tests/README.md b/spacy/tests/README.md index 82fabcc77..f3c96a39e 100644 --- a/spacy/tests/README.md +++ b/spacy/tests/README.md @@ -40,7 +40,7 @@ py.test spacy/tests/tokenizer/test_exceptions.py::test_tokenizer_handles_emoji # To keep the behavior of the tests consistent and predictable, we try to follow a few basic conventions: -- **Test names** should follow a pattern of `test_[module]_[tested behaviour]`. For example: `test_tokenizer_keeps_email` or `test_spans_override_sentiment`. +- **Test names** should follow a pattern of `test_[module]_[tested behaviour]`. For example: `test_tokenizer_keeps_email`. - If you're testing for a bug reported in a specific issue, always create a **regression test**. Regression tests should be named `test_issue[ISSUE NUMBER]` and live in the [`regression`](regression) directory. - Only use `@pytest.mark.xfail` for tests that **should pass, but currently fail**. To test for desired negative behavior, use `assert not` in your test. - Very **extensive tests** that take a long time to run should be marked with `@pytest.mark.slow`. 
If your slow test is testing important behavior, consider adding an additional simpler version. diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 38003dea9..f77d54493 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -380,9 +380,7 @@ def test_doc_api_serialize(en_tokenizer, text): assert [t.text for t in tokens] == [t.text for t in new_tokens] assert [t.orth for t in tokens] == [t.orth for t in new_tokens] - new_tokens = Doc(tokens.vocab).from_bytes( - tokens.to_bytes(exclude=["sentiment"]), exclude=["sentiment"] - ) + new_tokens = Doc(tokens.vocab).from_bytes(tokens.to_bytes()) assert tokens.text == new_tokens.text assert [t.text for t in tokens] == [t.text for t in new_tokens] assert [t.orth for t in tokens] == [t.orth for t in new_tokens] @@ -990,3 +988,12 @@ def test_doc_spans_setdefault(en_tokenizer): assert len(doc.spans["key2"]) == 1 doc.spans.setdefault("key3", default=SpanGroup(doc, spans=[doc[0:1], doc[1:2]])) assert len(doc.spans["key3"]) == 2 + + +def test_doc_sentiment_from_bytes_v3_to_v4(): + """Test if a doc with sentiment attribute created in v3.x works with '.from_bytes' in v4.x without throwing errors. The sentiment attribute was removed in v4""" + doc_bytes = b"\x89\xa4text\xa5happy\xaaarray_head\x9fGQACKOLMN\xcd\x01\xc4\xcd\x01\xc6I\xcd\x01\xc5JP\xaaarray_body\x85\xc4\x02nd\xc3\xc4\x04type\xa3 FloatsXd: ... @property - def sentiment(self) -> float: ... - @property def text(self) -> str: ... @property def text_with_ws(self) -> str: ... diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 89d9727e9..5530dd127 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -566,16 +566,6 @@ cdef class Span: return None return self.doc.tensor[self.start : self.end] - @property - def sentiment(self): - """RETURNS (float): A scalar value indicating the positivity or - negativity of the span. 
- """ - if "sentiment" in self.doc.user_span_hooks: - return self.doc.user_span_hooks["sentiment"](self) - else: - return sum([token.sentiment for token in self]) / len(self) - @property def text(self): """RETURNS (str): The original verbatim text of the span.""" diff --git a/spacy/tokens/token.pyi b/spacy/tokens/token.pyi index bd585d034..6de7e984a 100644 --- a/spacy/tokens/token.pyi +++ b/spacy/tokens/token.pyi @@ -79,8 +79,6 @@ class Token: @property def prob(self) -> float: ... @property - def sentiment(self) -> float: ... - @property def lang(self) -> int: ... @property def idx(self) -> int: ... diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index cee903f48..64c707acd 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -283,14 +283,6 @@ cdef class Token: """RETURNS (float): Smoothed log probability estimate of token type.""" return self.vocab[self.c.lex.orth].prob - @property - def sentiment(self): - """RETURNS (float): A scalar value indicating the positivity or - negativity of the token.""" - if "sentiment" in self.doc.user_token_hooks: - return self.doc.user_token_hooks["sentiment"](self) - return self.vocab[self.c.lex.orth].sentiment - @property def lang(self): """RETURNS (uint64): ID of the language of the parent document's diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index 19eb5052e..433134278 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -761,7 +761,6 @@ The L2 norm of the document's vector representation. | `user_data` | A generic storage area, for user custom data. ~~Dict[str, Any]~~ | | `lang` 2.1 | Language of the document's vocabulary. ~~int~~ | | `lang_` 2.1 | Language of the document's vocabulary. ~~str~~ | -| `sentiment` | The document's positivity/negativity score, if available. ~~float~~ | | `user_hooks` | A dictionary that allows customization of the `Doc`'s properties. 
~~Dict[str, Callable]~~ | | `user_token_hooks` | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~ | | `user_span_hooks` | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~ | @@ -785,7 +784,6 @@ serialization by passing in the string names via the `exclude` argument. | Name | Description | | ------------------ | --------------------------------------------- | | `text` | The value of the `Doc.text` attribute. | -| `sentiment` | The value of the `Doc.sentiment` attribute. | | `tensor` | The value of the `Doc.tensor` attribute. | | `user_data` | The value of the `Doc.user_data` dictionary. | | `user_data_keys` | The keys of the `Doc.user_data` dictionary. | diff --git a/website/docs/api/lexeme.md b/website/docs/api/lexeme.md index c5d4b7544..db1aba7aa 100644 --- a/website/docs/api/lexeme.md +++ b/website/docs/api/lexeme.md @@ -161,4 +161,3 @@ The L2 norm of the lexeme's vector representation. | `lang_` | Language of the parent vocabulary. ~~str~~ | | `prob` | Smoothed log probability estimate of the lexeme's word type (context-independent entry in the vocabulary). ~~float~~ | | `cluster` | Brown cluster ID. ~~int~~ | -| `sentiment` | A scalar value indicating the positivity or negativity of the lexeme. ~~float~~ | diff --git a/website/docs/api/span.md b/website/docs/api/span.md index be522c31f..9bca0c410 100644 --- a/website/docs/api/span.md +++ b/website/docs/api/span.md @@ -565,5 +565,4 @@ overlaps with will be returned. | `ent_id_` | Alias for `id_`: the span's ID. ~~str~~ | | `id` | The hash value of the span's ID. ~~int~~ | | `id_` | The span's ID. ~~str~~ | -| `sentiment` | A scalar value indicating the positivity or negativity of the span. ~~float~~ | | `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). 
~~Underscore~~ | diff --git a/website/docs/api/token.md b/website/docs/api/token.md index 73447e4d3..6c35d47b1 100644 --- a/website/docs/api/token.md +++ b/website/docs/api/token.md @@ -470,7 +470,6 @@ The L2 norm of the token's vector representation. | `lang_` | Language of the parent document's vocabulary. ~~str~~ | | `prob` | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). ~~float~~ | | `idx` | The character offset of the token within the parent document. ~~int~~ | -| `sentiment` | A scalar value indicating the positivity or negativity of the token. ~~float~~ | | `lex_id` | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~ | | `rank` | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~ | | `cluster` | Brown cluster ID. ~~int~~ | diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index 2463b523f..67c88700d 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -1400,7 +1400,7 @@ separation and makes it easier to ensure backwards compatibility. For example, if you've implemented your own `.coref` property and spaCy claims it one day, it'll break your code. Similarly, just by looking at the code, you'll immediately know what's built-in and what's custom – for example, -`doc.sentiment` is spaCy, while `doc._.sent_score` isn't. +`doc.lang` is spaCy, while `doc._.language` isn't. diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index 8e55d54d6..77461c495 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -776,6 +776,9 @@ whitespace, making them easy to match as well. 
### {executable="true"} from spacy.lang.en import English from spacy.matcher import Matcher +from spacy.tokens import Doc + +Doc.set_extension("sentiment", default=0.0) nlp = English() # We only want the tokenizer, so no need to load a pipeline matcher = Matcher(nlp.vocab) @@ -791,9 +794,9 @@ neg_patterns = [[{"ORTH": emoji}] for emoji in neg_emoji] def label_sentiment(matcher, doc, i, matches): match_id, start, end = matches[i] if doc.vocab.strings[match_id] == "HAPPY": # Don't forget to get string! - doc.sentiment += 0.1 # Add 0.1 for positive sentiment + doc._.sentiment += 0.1 # Add 0.1 for positive sentiment elif doc.vocab.strings[match_id] == "SAD": - doc.sentiment -= 0.1 # Subtract 0.1 for negative sentiment + doc._.sentiment -= 0.1 # Subtract 0.1 for negative sentiment matcher.add("HAPPY", pos_patterns, on_match=label_sentiment) # Add positive pattern matcher.add("SAD", neg_patterns, on_match=label_sentiment) # Add negative pattern @@ -823,16 +826,17 @@ the emoji span will make it available as `span._.emoji_desc`. ```python from emojipedia import Emojipedia # Installation: pip install emojipedia -from spacy.tokens import Span # Get the global Span object +from spacy.tokens import Doc, Span # Get the global Doc and Span objects Span.set_extension("emoji_desc", default=None) # Register the custom attribute +Doc.set_extension("sentiment", default=0.0) def label_sentiment(matcher, doc, i, matches): match_id, start, end = matches[i] if doc.vocab.strings[match_id] == "HAPPY": # Don't forget to get string! - doc.sentiment += 0.1 # Add 0.1 for positive sentiment + doc._.sentiment += 0.1 # Add 0.1 for positive sentiment elif doc.vocab.strings[match_id] == "SAD": - doc.sentiment -= 0.1 # Subtract 0.1 for negative sentiment + doc._.sentiment -= 0.1 # Subtract 0.1 for negative sentiment span = doc[start:end] emoji = Emojipedia.search(span[0].text) # Get data for emoji span._.emoji_desc = emoji.title # Assign emoji description