Remove sentiment extension (#11722)

* remove sentiment attribute
* remove sentiment from docs
* add test for backwards compatibility
* replace from_disk with from_bytes
* Fix docs and format file
* Fix formatting

Parent: d0fc871a1c
Commit: e79910d57e
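For projects that still need a document-level score after upgrading, the pattern the updated usage docs in this commit rely on is a custom extension attribute. A minimal sketch (the `sentiment` name and default are taken from the docs example further down; the rest is illustrative):

```python
from spacy.lang.en import English
from spacy.tokens import Doc

# doc._.sentiment replaces the removed built-in doc.sentiment
Doc.set_extension("sentiment", default=0.0)

nlp = English()
doc = nlp("I am happy")
doc._.sentiment += 0.1
print(doc._.sentiment)
```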
@@ -20,7 +20,6 @@ class Lexeme:
     def vector_norm(self) -> float: ...
     vector: Floats1d
     rank: int
-    sentiment: float
     @property
     def orth_(self) -> str: ...
     @property

@@ -173,19 +173,6 @@ cdef class Lexeme:
         def __set__(self, value):
             self.c.id = value
 
-    property sentiment:
-        """RETURNS (float): A scalar value indicating the positivity or
-        negativity of the lexeme."""
-        def __get__(self):
-            sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment", {})
-            return sentiment_table.get(self.c.orth, 0.0)
-
-        def __set__(self, float x):
-            if "lexeme_sentiment" not in self.vocab.lookups:
-                self.vocab.lookups.add_table("lexeme_sentiment")
-            sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment")
-            sentiment_table[self.c.orth] = x
-
     @property
     def orth_(self):
         """RETURNS (str): The original verbatim text of the lexeme

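The removed `Lexeme.sentiment` property above was a thin wrapper around a `"lexeme_sentiment"` lookups table. Code that relied on it can keep using the vocab lookups directly; a rough sketch (the table name is kept from the removed code, everything else is illustrative):

```python
import spacy

nlp = spacy.blank("en")
# The removed property read and wrote this table under the hood
table = nlp.vocab.lookups.add_table("lexeme_sentiment")
lex = nlp.vocab["happy"]
table[lex.orth] = 1.0              # was: lex.sentiment = 1.0
print(table.get(lex.orth, 0.0))    # was: lex.sentiment
```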
@@ -40,7 +40,7 @@ py.test spacy/tests/tokenizer/test_exceptions.py::test_tokenizer_handles_emoji #
 
 To keep the behavior of the tests consistent and predictable, we try to follow a few basic conventions:
 
-- **Test names** should follow a pattern of `test_[module]_[tested behaviour]`. For example: `test_tokenizer_keeps_email` or `test_spans_override_sentiment`.
+- **Test names** should follow a pattern of `test_[module]_[tested behaviour]`. For example: `test_tokenizer_keeps_email`.
 - If you're testing for a bug reported in a specific issue, always create a **regression test**. Regression tests should be named `test_issue[ISSUE NUMBER]` and live in the [`regression`](regression) directory.
 - Only use `@pytest.mark.xfail` for tests that **should pass, but currently fail**. To test for desired negative behavior, use `assert not` in your test.
 - Very **extensive tests** that take a long time to run should be marked with `@pytest.mark.slow`. If your slow test is testing important behavior, consider adding an additional simpler version.

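As a concrete, made-up illustration of these conventions (using the suite's `en_tokenizer` fixture), a small test module might look like this:

```python
import pytest


def test_tokenizer_keeps_email(en_tokenizer):
    # [module]_[tested behaviour]: the tokenizer should keep e-mail addresses intact
    tokens = en_tokenizer("Reach me at test@example.com")
    assert "test@example.com" in [t.text for t in tokens]


@pytest.mark.slow
def test_tokenizer_handles_long_text(en_tokenizer):
    # an extensive test, so it is marked as slow
    tokens = en_tokenizer("spaCy " * 10_000)
    assert len(tokens) > 0
```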
@@ -380,9 +380,7 @@ def test_doc_api_serialize(en_tokenizer, text):
     assert [t.text for t in tokens] == [t.text for t in new_tokens]
     assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
 
-    new_tokens = Doc(tokens.vocab).from_bytes(
-        tokens.to_bytes(exclude=["sentiment"]), exclude=["sentiment"]
-    )
+    new_tokens = Doc(tokens.vocab).from_bytes(tokens.to_bytes())
     assert tokens.text == new_tokens.text
     assert [t.text for t in tokens] == [t.text for t in new_tokens]
     assert [t.orth for t in tokens] == [t.orth for t in new_tokens]

@@ -990,3 +988,12 @@ def test_doc_spans_setdefault(en_tokenizer):
     assert len(doc.spans["key2"]) == 1
     doc.spans.setdefault("key3", default=SpanGroup(doc, spans=[doc[0:1], doc[1:2]]))
     assert len(doc.spans["key3"]) == 2
+
+
+def test_doc_sentiment_from_bytes_v3_to_v4():
+    """Test if a doc with sentiment attribute created in v3.x works with '.from_bytes' in v4.x without throwing errors. The sentiment attribute was removed in v4"""
+    doc_bytes = b"\x89\xa4text\xa5happy\xaaarray_head\x9fGQACKOLMN\xcd\x01\xc4\xcd\x01\xc6I\xcd\x01\xc5JP\xaaarray_body\x85\xc4\x02nd\xc3\xc4\x04type\xa3<u8\xc4\x04kind\xc4\x00\xc4\x05shape\x92\x01\x0f\xc4\x04data\xc4x\x05\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xa4\x9a\xd3\x17\xca\xf0b\x03\xa4\x9a\xd3\x17\xca\xf0b\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\xa9sentiment\xcb?\xf0\x00\x00\x00\x00\x00\x00\xa6tensor\x85\xc4\x02nd\xc3\xc4\x04type\xa3<f4\xc4\x04kind\xc4\x00\xc4\x05shape\x91\x00\xc4\x04data\xc4\x00\xa4cats\x80\xa5spans\xc4\x01\x90\xa7strings\x92\xa0\xa5happy\xb2has_unknown_spaces\xc2"
+    doc = Doc(Vocab()).from_bytes(doc_bytes)
+    assert doc.text == "happy"
+    with pytest.raises(AttributeError):
+        doc.sentiment == 1.0

@@ -305,31 +305,6 @@ def test_span_similarity_match():
     assert span1[:1].similarity(doc.vocab["a"]) == 1.0
 
 
-def test_spans_default_sentiment(en_tokenizer):
-    """Test span.sentiment property's default averaging behaviour"""
-    text = "good stuff bad stuff"
-    tokens = en_tokenizer(text)
-    tokens.vocab[tokens[0].text].sentiment = 3.0
-    tokens.vocab[tokens[2].text].sentiment = -2.0
-    doc = Doc(tokens.vocab, words=[t.text for t in tokens])
-    assert doc[:2].sentiment == 3.0 / 2
-    assert doc[-2:].sentiment == -2.0 / 2
-    assert doc[:-1].sentiment == (3.0 + -2) / 3.0
-
-
-def test_spans_override_sentiment(en_tokenizer):
-    """Test span.sentiment property's default averaging behaviour"""
-    text = "good stuff bad stuff"
-    tokens = en_tokenizer(text)
-    tokens.vocab[tokens[0].text].sentiment = 3.0
-    tokens.vocab[tokens[2].text].sentiment = -2.0
-    doc = Doc(tokens.vocab, words=[t.text for t in tokens])
-    doc.user_span_hooks["sentiment"] = lambda span: 10.0
-    assert doc[:2].sentiment == 10.0
-    assert doc[-2:].sentiment == 10.0
-    assert doc[:-1].sentiment == 10.0
-
-
 def test_spans_are_hashable(en_tokenizer):
     """Test spans can be hashed."""
     text = "good stuff bad stuff"

@@ -50,8 +50,6 @@ def test_matcher_from_usage_docs(en_vocab):
 
     def label_sentiment(matcher, doc, i, matches):
         match_id, start, end = matches[i]
-        if doc.vocab.strings[match_id] == "HAPPY":
-            doc.sentiment += 0.1
         span = doc[start:end]
         with doc.retokenize() as retokenizer:
             retokenizer.merge(span)

@@ -61,7 +59,6 @@ def test_matcher_from_usage_docs(en_vocab):
     matcher = Matcher(en_vocab)
     matcher.add("HAPPY", pos_patterns, on_match=label_sentiment)
     matcher(doc)
-    assert doc.sentiment != 0
     assert doc[1].norm_ == "happy emoji"
 
 

@@ -48,8 +48,6 @@ cdef class Doc:
 
     cdef TokenC* c
 
-    cdef public float sentiment
-
     cdef public dict activations
 
     cdef public dict user_hooks

@@ -21,7 +21,6 @@ class Doc:
     spans: SpanGroups
    max_length: int
     length: int
-    sentiment: float
     activations: Dict[str, Dict[str, Union[ArrayXd, Ragged]]]
     cats: Dict[str, float]
     user_hooks: Dict[str, Callable[..., Any]]

@@ -243,7 +243,6 @@ cdef class Doc:
         self.c = data_start + PADDING
         self.max_length = size
         self.length = 0
-        self.sentiment = 0.0
         self.cats = {}
         self.activations = {}
         self.user_hooks = {}

@@ -1270,7 +1269,6 @@ cdef class Doc:
         other.tensor = copy.deepcopy(self.tensor)
         other.cats = copy.deepcopy(self.cats)
         other.user_data = copy.deepcopy(self.user_data)
-        other.sentiment = self.sentiment
         other.has_unknown_spaces = self.has_unknown_spaces
         other.user_hooks = dict(self.user_hooks)
         other.user_token_hooks = dict(self.user_token_hooks)

@@ -1367,7 +1365,6 @@ cdef class Doc:
             "text": lambda: self.text,
             "array_head": lambda: array_head,
             "array_body": lambda: self.to_array(array_head),
-            "sentiment": lambda: self.sentiment,
             "tensor": lambda: self.tensor,
             "cats": lambda: self.cats,
             "spans": lambda: self.spans.to_bytes(),

@@ -1405,8 +1402,6 @@ cdef class Doc:
             for key, value in zip(user_data_keys, user_data_values):
                 self.user_data[key] = value
         cdef int i, start, end, has_space
-        if "sentiment" not in exclude and "sentiment" in msg:
-            self.sentiment = msg["sentiment"]
         if "tensor" not in exclude and "tensor" in msg:
             self.tensor = msg["tensor"]
         if "cats" not in exclude and "cats" in msg:

@@ -82,8 +82,6 @@ class Span:
     @property
     def tensor(self) -> FloatsXd: ...
     @property
-    def sentiment(self) -> float: ...
-    @property
     def text(self) -> str: ...
     @property
     def text_with_ws(self) -> str: ...

@@ -566,16 +566,6 @@ cdef class Span:
             return None
         return self.doc.tensor[self.start : self.end]
 
-    @property
-    def sentiment(self):
-        """RETURNS (float): A scalar value indicating the positivity or
-        negativity of the span.
-        """
-        if "sentiment" in self.doc.user_span_hooks:
-            return self.doc.user_span_hooks["sentiment"](self)
-        else:
-            return sum([token.sentiment for token in self]) / len(self)
-
     @property
     def text(self):
         """RETURNS (str): The original verbatim text of the span."""

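The removed `Span.sentiment` default above was a plain average over the span's tokens. Anyone who depended on that behaviour can recreate it with a `Span` getter extension; a sketch under the assumption that token scores now live in a custom `token._.sentiment` attribute:

```python
from spacy.tokens import Span, Token

Token.set_extension("sentiment", default=0.0)

# Same averaging as the removed built-in property, but in user space
Span.set_extension(
    "sentiment",
    getter=lambda span: sum(t._.sentiment for t in span) / len(span),
)
```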
@@ -79,8 +79,6 @@ class Token:
     @property
     def prob(self) -> float: ...
     @property
-    def sentiment(self) -> float: ...
-    @property
     def lang(self) -> int: ...
     @property
     def idx(self) -> int: ...

@@ -283,14 +283,6 @@ cdef class Token:
         """RETURNS (float): Smoothed log probability estimate of token type."""
         return self.vocab[self.c.lex.orth].prob
 
-    @property
-    def sentiment(self):
-        """RETURNS (float): A scalar value indicating the positivity or
-        negativity of the token."""
-        if "sentiment" in self.doc.user_token_hooks:
-            return self.doc.user_token_hooks["sentiment"](self)
-        return self.vocab[self.c.lex.orth].sentiment
-
     @property
     def lang(self):
         """RETURNS (uint64): ID of the language of the parent document's

@@ -761,7 +761,6 @@ The L2 norm of the document's vector representation.
 | `user_data` | A generic storage area, for user custom data. ~~Dict[str, Any]~~ |
 | `lang` <Tag variant="new">2.1</Tag> | Language of the document's vocabulary. ~~int~~ |
 | `lang_` <Tag variant="new">2.1</Tag> | Language of the document's vocabulary. ~~str~~ |
-| `sentiment` | The document's positivity/negativity score, if available. ~~float~~ |
 | `user_hooks` | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~ |
 | `user_token_hooks` | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~ |
 | `user_span_hooks` | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~ |

@@ -785,7 +784,6 @@ serialization by passing in the string names via the `exclude` argument.
 | Name | Description |
 | ------------------ | --------------------------------------------- |
 | `text` | The value of the `Doc.text` attribute. |
-| `sentiment` | The value of the `Doc.sentiment` attribute. |
 | `tensor` | The value of the `Doc.tensor` attribute. |
 | `user_data` | The value of the `Doc.user_data` dictionary. |
 | `user_data_keys` | The keys of the `Doc.user_data` dictionary. |

@@ -161,4 +161,3 @@ The L2 norm of the lexeme's vector representation.
 | `lang_` | Language of the parent vocabulary. ~~str~~ |
 | `prob` | Smoothed log probability estimate of the lexeme's word type (context-independent entry in the vocabulary). ~~float~~ |
 | `cluster` | Brown cluster ID. ~~int~~ |
-| `sentiment` | A scalar value indicating the positivity or negativity of the lexeme. ~~float~~ |

@@ -565,5 +565,4 @@ overlaps with will be returned.
 | `ent_id_` | Alias for `id_`: the span's ID. ~~str~~ |
 | `id` | The hash value of the span's ID. ~~int~~ |
 | `id_` | The span's ID. ~~str~~ |
-| `sentiment` | A scalar value indicating the positivity or negativity of the span. ~~float~~ |
 | `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ |

@@ -470,7 +470,6 @@ The L2 norm of the token's vector representation.
 | `lang_` | Language of the parent document's vocabulary. ~~str~~ |
 | `prob` | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). ~~float~~ |
 | `idx` | The character offset of the token within the parent document. ~~int~~ |
-| `sentiment` | A scalar value indicating the positivity or negativity of the token. ~~float~~ |
 | `lex_id` | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~ |
 | `rank` | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~ |
 | `cluster` | Brown cluster ID. ~~int~~ |

@@ -1400,7 +1400,7 @@ separation and makes it easier to ensure backwards compatibility. For example,
 if you've implemented your own `.coref` property and spaCy claims it one day,
 it'll break your code. Similarly, just by looking at the code, you'll
 immediately know what's built-in and what's custom – for example,
-`doc.sentiment` is spaCy, while `doc._.sent_score` isn't.
+`doc.lang` is spaCy, while `doc._.language` isn't.
 
 </Accordion>
 

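For illustration of the built-in vs. custom distinction the updated docs draw, a small sketch (the `sent_score` name comes from the old docs wording; everything else is hypothetical):

```python
import spacy
from spacy.tokens import Doc

# User-space attribute: clearly custom, unlike the built-in doc.lang
Doc.set_extension("sent_score", default=0.0)

nlp = spacy.blank("en")
doc = nlp("hello world")
doc._.sent_score = 0.5
print(doc.lang_, doc._.sent_score)  # built-in attribute vs. custom extension
```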
@@ -776,6 +776,9 @@ whitespace, making them easy to match as well.
 ### {executable="true"}
 from spacy.lang.en import English
 from spacy.matcher import Matcher
+from spacy.tokens import Doc
+
+Doc.set_extension("sentiment", default=0.0)
 
 nlp = English()  # We only want the tokenizer, so no need to load a pipeline
 matcher = Matcher(nlp.vocab)

@@ -791,9 +794,9 @@ neg_patterns = [[{"ORTH": emoji}] for emoji in neg_emoji]
 def label_sentiment(matcher, doc, i, matches):
     match_id, start, end = matches[i]
     if doc.vocab.strings[match_id] == "HAPPY":  # Don't forget to get string!
-        doc.sentiment += 0.1  # Add 0.1 for positive sentiment
+        doc._.sentiment += 0.1  # Add 0.1 for positive sentiment
     elif doc.vocab.strings[match_id] == "SAD":
-        doc.sentiment -= 0.1  # Subtract 0.1 for negative sentiment
+        doc._.sentiment -= 0.1  # Subtract 0.1 for negative sentiment
 
 matcher.add("HAPPY", pos_patterns, on_match=label_sentiment)  # Add positive pattern
 matcher.add("SAD", neg_patterns, on_match=label_sentiment)  # Add negative pattern

@@ -823,16 +826,17 @@ the emoji span will make it available as `span._.emoji_desc`.
 
 ```python
 from emojipedia import Emojipedia  # Installation: pip install emojipedia
-from spacy.tokens import Span  # Get the global Span object
+from spacy.tokens import Doc, Span  # Get the global Doc and Span object
 
 Span.set_extension("emoji_desc", default=None)  # Register the custom attribute
+Doc.set_extension("sentiment", default=0.0)
 
 def label_sentiment(matcher, doc, i, matches):
     match_id, start, end = matches[i]
     if doc.vocab.strings[match_id] == "HAPPY":  # Don't forget to get string!
-        doc.sentiment += 0.1  # Add 0.1 for positive sentiment
+        doc._.sentiment += 0.1  # Add 0.1 for positive sentiment
     elif doc.vocab.strings[match_id] == "SAD":
-        doc.sentiment -= 0.1  # Subtract 0.1 for negative sentiment
+        doc._.sentiment -= 0.1  # Subtract 0.1 for negative sentiment
     span = doc[start:end]
     emoji = Emojipedia.search(span[0].text)  # Get data for emoji
     span._.emoji_desc = emoji.title  # Assign emoji description