From e79910d57ec4f2f5d02e0c8d137a7ae8cd608135 Mon Sep 17 00:00:00 2001 From: Edward <43848523+thomashacker@users.noreply.github.com> Date: Wed, 23 Nov 2022 13:09:32 +0100 Subject: [PATCH] Remove sentiment extension (#11722) * remove sentiment attribute * remove sentiment from docs * add test for backwards compatibility * replace from_disk with from_bytes * Fix docs and format file * Fix formatting --- spacy/lexeme.pyi | 1 - spacy/lexeme.pyx | 13 ----------- spacy/tests/README.md | 2 +- spacy/tests/doc/test_doc_api.py | 13 ++++++++--- spacy/tests/doc/test_span.py | 25 ---------------------- spacy/tests/matcher/test_matcher_api.py | 3 --- spacy/tokens/doc.pxd | 2 -- spacy/tokens/doc.pyi | 1 - spacy/tokens/doc.pyx | 5 ----- spacy/tokens/span.pyi | 2 -- spacy/tokens/span.pyx | 10 --------- spacy/tokens/token.pyi | 2 -- spacy/tokens/token.pyx | 8 ------- website/docs/api/doc.md | 2 -- website/docs/api/lexeme.md | 1 - website/docs/api/span.md | 1 - website/docs/api/token.md | 1 - website/docs/usage/processing-pipelines.md | 2 +- website/docs/usage/rule-based-matching.md | 14 +++++++----- 19 files changed, 21 insertions(+), 87 deletions(-) diff --git a/spacy/lexeme.pyi b/spacy/lexeme.pyi index 4fcaa82cf..4942b18aa 100644 --- a/spacy/lexeme.pyi +++ b/spacy/lexeme.pyi @@ -20,7 +20,6 @@ class Lexeme: def vector_norm(self) -> float: ... vector: Floats1d rank: int - sentiment: float @property def orth_(self) -> str: ... 
@property diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 6c66effde..73bf28dc2 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -173,19 +173,6 @@ cdef class Lexeme: def __set__(self, value): self.c.id = value - property sentiment: - """RETURNS (float): A scalar value indicating the positivity or - negativity of the lexeme.""" - def __get__(self): - sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment", {}) - return sentiment_table.get(self.c.orth, 0.0) - - def __set__(self, float x): - if "lexeme_sentiment" not in self.vocab.lookups: - self.vocab.lookups.add_table("lexeme_sentiment") - sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment") - sentiment_table[self.c.orth] = x - @property def orth_(self): """RETURNS (str): The original verbatim text of the lexeme diff --git a/spacy/tests/README.md b/spacy/tests/README.md index 82fabcc77..f3c96a39e 100644 --- a/spacy/tests/README.md +++ b/spacy/tests/README.md @@ -40,7 +40,7 @@ py.test spacy/tests/tokenizer/test_exceptions.py::test_tokenizer_handles_emoji # To keep the behavior of the tests consistent and predictable, we try to follow a few basic conventions: -- **Test names** should follow a pattern of `test_[module]_[tested behaviour]`. For example: `test_tokenizer_keeps_email` or `test_spans_override_sentiment`. +- **Test names** should follow a pattern of `test_[module]_[tested behaviour]`. For example: `test_tokenizer_keeps_email`. - If you're testing for a bug reported in a specific issue, always create a **regression test**. Regression tests should be named `test_issue[ISSUE NUMBER]` and live in the [`regression`](regression) directory. - Only use `@pytest.mark.xfail` for tests that **should pass, but currently fail**. To test for desired negative behavior, use `assert not` in your test. - Very **extensive tests** that take a long time to run should be marked with `@pytest.mark.slow`. 
If your slow test is testing important behavior, consider adding an additional simpler version. diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 38003dea9..f77d54493 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -380,9 +380,7 @@ def test_doc_api_serialize(en_tokenizer, text): assert [t.text for t in tokens] == [t.text for t in new_tokens] assert [t.orth for t in tokens] == [t.orth for t in new_tokens] - new_tokens = Doc(tokens.vocab).from_bytes( - tokens.to_bytes(exclude=["sentiment"]), exclude=["sentiment"] - ) + new_tokens = Doc(tokens.vocab).from_bytes(tokens.to_bytes()) assert tokens.text == new_tokens.text assert [t.text for t in tokens] == [t.text for t in new_tokens] assert [t.orth for t in tokens] == [t.orth for t in new_tokens] @@ -990,3 +988,12 @@ def test_doc_spans_setdefault(en_tokenizer): assert len(doc.spans["key2"]) == 1 doc.spans.setdefault("key3", default=SpanGroup(doc, spans=[doc[0:1], doc[1:2]])) assert len(doc.spans["key3"]) == 2 + + +def test_doc_sentiment_from_bytes_v3_to_v4(): + """Test if a doc with sentiment attribute created in v3.x works with '.from_bytes' in v4.x without throwing errors. The sentiment attribute was removed in v4""" + doc_bytes = b"\x89\xa4text\xa5happy\xaaarray_head\x9fGQACKOLMN\xcd\x01\xc4\xcd\x01\xc6I\xcd\x01\xc5JP\xaaarray_body\x85\xc4\x02nd\xc3\xc4\x04type\xa3 FloatsXd: ... @property - def sentiment(self) -> float: ... - @property def text(self) -> str: ... @property def text_with_ws(self) -> str: ... diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 89d9727e9..5530dd127 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -566,16 +566,6 @@ cdef class Span: return None return self.doc.tensor[self.start : self.end] - @property - def sentiment(self): - """RETURNS (float): A scalar value indicating the positivity or - negativity of the span. 
- """ - if "sentiment" in self.doc.user_span_hooks: - return self.doc.user_span_hooks["sentiment"](self) - else: - return sum([token.sentiment for token in self]) / len(self) - @property def text(self): """RETURNS (str): The original verbatim text of the span.""" diff --git a/spacy/tokens/token.pyi b/spacy/tokens/token.pyi index bd585d034..6de7e984a 100644 --- a/spacy/tokens/token.pyi +++ b/spacy/tokens/token.pyi @@ -79,8 +79,6 @@ class Token: @property def prob(self) -> float: ... @property - def sentiment(self) -> float: ... - @property def lang(self) -> int: ... @property def idx(self) -> int: ... diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index cee903f48..64c707acd 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -283,14 +283,6 @@ cdef class Token: """RETURNS (float): Smoothed log probability estimate of token type.""" return self.vocab[self.c.lex.orth].prob - @property - def sentiment(self): - """RETURNS (float): A scalar value indicating the positivity or - negativity of the token.""" - if "sentiment" in self.doc.user_token_hooks: - return self.doc.user_token_hooks["sentiment"](self) - return self.vocab[self.c.lex.orth].sentiment - @property def lang(self): """RETURNS (uint64): ID of the language of the parent document's diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index 19eb5052e..433134278 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -761,7 +761,6 @@ The L2 norm of the document's vector representation. | `user_data` | A generic storage area, for user custom data. ~~Dict[str, Any]~~ | | `lang` 2.1 | Language of the document's vocabulary. ~~int~~ | | `lang_` 2.1 | Language of the document's vocabulary. ~~str~~ | -| `sentiment` | The document's positivity/negativity score, if available. ~~float~~ | | `user_hooks` | A dictionary that allows customization of the `Doc`'s properties. 
~~Dict[str, Callable]~~ | | `user_token_hooks` | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~ | | `user_span_hooks` | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~ | @@ -785,7 +784,6 @@ serialization by passing in the string names via the `exclude` argument. | Name | Description | | ------------------ | --------------------------------------------- | | `text` | The value of the `Doc.text` attribute. | -| `sentiment` | The value of the `Doc.sentiment` attribute. | | `tensor` | The value of the `Doc.tensor` attribute. | | `user_data` | The value of the `Doc.user_data` dictionary. | | `user_data_keys` | The keys of the `Doc.user_data` dictionary. | diff --git a/website/docs/api/lexeme.md b/website/docs/api/lexeme.md index c5d4b7544..db1aba7aa 100644 --- a/website/docs/api/lexeme.md +++ b/website/docs/api/lexeme.md @@ -161,4 +161,3 @@ The L2 norm of the lexeme's vector representation. | `lang_` | Language of the parent vocabulary. ~~str~~ | | `prob` | Smoothed log probability estimate of the lexeme's word type (context-independent entry in the vocabulary). ~~float~~ | | `cluster` | Brown cluster ID. ~~int~~ | -| `sentiment` | A scalar value indicating the positivity or negativity of the lexeme. ~~float~~ | diff --git a/website/docs/api/span.md b/website/docs/api/span.md index be522c31f..9bca0c410 100644 --- a/website/docs/api/span.md +++ b/website/docs/api/span.md @@ -565,5 +565,4 @@ overlaps with will be returned. | `ent_id_` | Alias for `id_`: the span's ID. ~~str~~ | | `id` | The hash value of the span's ID. ~~int~~ | | `id_` | The span's ID. ~~str~~ | -| `sentiment` | A scalar value indicating the positivity or negativity of the span. ~~float~~ | | `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). 
~~Underscore~~ | diff --git a/website/docs/api/token.md b/website/docs/api/token.md index 73447e4d3..6c35d47b1 100644 --- a/website/docs/api/token.md +++ b/website/docs/api/token.md @@ -470,7 +470,6 @@ The L2 norm of the token's vector representation. | `lang_` | Language of the parent document's vocabulary. ~~str~~ | | `prob` | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). ~~float~~ | | `idx` | The character offset of the token within the parent document. ~~int~~ | -| `sentiment` | A scalar value indicating the positivity or negativity of the token. ~~float~~ | | `lex_id` | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~ | | `rank` | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~ | | `cluster` | Brown cluster ID. ~~int~~ | diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index 2463b523f..67c88700d 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -1400,7 +1400,7 @@ separation and makes it easier to ensure backwards compatibility. For example, if you've implemented your own `.coref` property and spaCy claims it one day, it'll break your code. Similarly, just by looking at the code, you'll immediately know what's built-in and what's custom – for example, -`doc.sentiment` is spaCy, while `doc._.sent_score` isn't. +`doc.lang` is spaCy, while `doc._.language` isn't. diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index 8e55d54d6..77461c495 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -776,6 +776,9 @@ whitespace, making them easy to match as well. 
### {executable="true"} from spacy.lang.en import English from spacy.matcher import Matcher +from spacy.tokens import Doc + +Doc.set_extension("sentiment", default=0.0) nlp = English() # We only want the tokenizer, so no need to load a pipeline matcher = Matcher(nlp.vocab) @@ -791,9 +794,9 @@ neg_patterns = [[{"ORTH": emoji}] for emoji in neg_emoji] def label_sentiment(matcher, doc, i, matches): match_id, start, end = matches[i] if doc.vocab.strings[match_id] == "HAPPY": # Don't forget to get string! - doc.sentiment += 0.1 # Add 0.1 for positive sentiment + doc._.sentiment += 0.1 # Add 0.1 for positive sentiment elif doc.vocab.strings[match_id] == "SAD": - doc.sentiment -= 0.1 # Subtract 0.1 for negative sentiment + doc._.sentiment -= 0.1 # Subtract 0.1 for negative sentiment matcher.add("HAPPY", pos_patterns, on_match=label_sentiment) # Add positive pattern matcher.add("SAD", neg_patterns, on_match=label_sentiment) # Add negative pattern @@ -823,16 +826,17 @@ the emoji span will make it available as `span._.emoji_desc`. ```python from emojipedia import Emojipedia # Installation: pip install emojipedia -from spacy.tokens import Span # Get the global Span object +from spacy.tokens import Doc, Span # Get the global Doc and Span objects Span.set_extension("emoji_desc", default=None) # Register the custom attribute +Doc.set_extension("sentiment", default=0.0) def label_sentiment(matcher, doc, i, matches): match_id, start, end = matches[i] if doc.vocab.strings[match_id] == "HAPPY": # Don't forget to get string! - doc.sentiment += 0.1 # Add 0.1 for positive sentiment + doc._.sentiment += 0.1 # Add 0.1 for positive sentiment elif doc.vocab.strings[match_id] == "SAD": - doc.sentiment -= 0.1 # Subtract 0.1 for negative sentiment + doc._.sentiment -= 0.1 # Subtract 0.1 for negative sentiment span = doc[start:end] emoji = Emojipedia.search(span[0].text) # Get data for emoji span._.emoji_desc = emoji.title # Assign emoji description