From f8ac5b9f563050472aedc719950b4888c65ca4cc Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Mon, 27 Apr 2020 16:51:27 +0200 Subject: [PATCH] bugfix in span similarity (#5155) (#5358) * bugfix in span similarity * also rewrite doc.pyx for clarity * formatting Co-authored-by: Sofie Van Landeghem --- spacy/tests/regression/test_issue5152.py | 18 ++++++++++++++++++ spacy/tokens/doc.pyx | 15 ++++++++------- spacy/tokens/span.pyx | 6 ++++-- 3 files changed, 30 insertions(+), 9 deletions(-) create mode 100644 spacy/tests/regression/test_issue5152.py diff --git a/spacy/tests/regression/test_issue5152.py b/spacy/tests/regression/test_issue5152.py new file mode 100644 index 000000000..a9a57746d --- /dev/null +++ b/spacy/tests/regression/test_issue5152.py @@ -0,0 +1,18 @@ +from spacy.lang.en import English + + +def test_issue5152(): + # Test that the comparison between a Span and a Token goes well + # There was a bug when the number of tokens in the span equaled the number of characters in the token (!) 
+ nlp = English() + text = nlp("Talk about being boring!") + text_var = nlp("Talk of being boring!") + y = nlp("Let") + + span = text[0:3] # Talk about being + span_2 = text[0:3] # Talk about being + span_3 = text_var[0:3] # Talk of being + token = y[0] # Let + assert span.similarity(token) == 0.0 + assert span.similarity(span_2) == 1.0 + assert span_2.similarity(span_3) < 1.0 diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index ec0cd66b8..f27115e6f 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -387,13 +387,14 @@ cdef class Doc: if isinstance(other, (Lexeme, Token)) and self.length == 1: if self.c[0].lex.orth == other.orth: return 1.0 - elif isinstance(other, (Span, Doc)): - if len(self) == len(other): - for i in range(self.length): - if self[i].orth != other[i].orth: - break - else: - return 1.0 + elif isinstance(other, (Span, Doc)) and len(self) == len(other): + similar = True + for i in range(self.length): + if self[i].orth != other[i].orth: + similar = False + break + if similar: + return 1.0 if self.vocab.vectors.n_keys == 0: models_warning(Warnings.W007.format(obj="Doc")) if self.vector_norm == 0 or other.vector_norm == 0: diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 35c70f236..9269700b0 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -324,11 +324,13 @@ cdef class Span: if len(self) == 1 and hasattr(other, "orth"): if self[0].orth == other.orth: return 1.0 - elif hasattr(other, "__len__") and len(self) == len(other): + elif isinstance(other, (Doc, Span)) and len(self) == len(other): + similar = True for i in range(len(self)): if self[i].orth != getattr(other[i], "orth", None): + similar = False break - else: + if similar: return 1.0 if self.vocab.vectors.n_keys == 0: models_warning(Warnings.W007.format(obj="Span"))