From d6d95674c15d36afa12b819217a722a3c14a7353 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Sun, 29 Mar 2020 13:56:07 +0200
Subject: [PATCH] bugfix in span similarity (#5155)

* bugfix in span similarity

* also rewrite doc.pyx for clarity

* formatting
---
 spacy/tests/regression/test_issue5152.py | 18 ++++++++++++++++++
 spacy/tokens/doc.pyx                     | 15 ++++++++-------
 spacy/tokens/span.pyx                    |  6 ++++--
 3 files changed, 30 insertions(+), 9 deletions(-)
 create mode 100644 spacy/tests/regression/test_issue5152.py

diff --git a/spacy/tests/regression/test_issue5152.py b/spacy/tests/regression/test_issue5152.py
new file mode 100644
index 000000000..a9a57746d
--- /dev/null
+++ b/spacy/tests/regression/test_issue5152.py
@@ -0,0 +1,18 @@
+from spacy.lang.en import English
+
+
+def test_issue5152():
+    # Test that the comparison between a Span and a Token, goes well
+    # There was a bug when the number of tokens in the span equaled the number of characters in the token (!)
+    nlp = English()
+    text = nlp("Talk about being boring!")
+    text_var = nlp("Talk of being boring!")
+    y = nlp("Let")
+
+    span = text[0:3]  # Talk about being
+    span_2 = text[0:3]  # Talk about being
+    span_3 = text_var[0:3]  # Talk of being
+    token = y[0]  # Let
+    assert span.similarity(token) == 0.0
+    assert span.similarity(span_2) == 1.0
+    assert span_2.similarity(span_3) < 1.0
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index a6b1b171b..0716b2b3d 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -380,13 +380,14 @@ cdef class Doc:
         if isinstance(other, (Lexeme, Token)) and self.length == 1:
             if self.c[0].lex.orth == other.orth:
                 return 1.0
-        elif isinstance(other, (Span, Doc)):
-            if len(self) == len(other):
-                for i in range(self.length):
-                    if self[i].orth != other[i].orth:
-                        break
-                else:
-                    return 1.0
+        elif isinstance(other, (Span, Doc)) and len(self) == len(other):
+            similar = True
+            for i in range(self.length):
+                if self[i].orth != other[i].orth:
+                    similar = False
+                    break
+            if similar:
+                return 1.0
         if self.vocab.vectors.n_keys == 0:
             warnings.warn(Warnings.W007.format(obj="Doc"))
         if self.vector_norm == 0 or other.vector_norm == 0:
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 53d1b9826..66e8d8c3e 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -320,11 +320,13 @@ cdef class Span:
         if len(self) == 1 and hasattr(other, "orth"):
             if self[0].orth == other.orth:
                 return 1.0
-        elif hasattr(other, "__len__") and len(self) == len(other):
+        elif isinstance(other, (Doc, Span)) and len(self) == len(other):
+            similar = True
             for i in range(len(self)):
                 if self[i].orth != getattr(other[i], "orth", None):
+                    similar = False
                     break
-            else:
+            if similar:
                 return 1.0
         if self.vocab.vectors.n_keys == 0:
             warnings.warn(Warnings.W007.format(obj="Span"))