From f8ac5b9f563050472aedc719950b4888c65ca4cc Mon Sep 17 00:00:00 2001
From: adrianeboyd <adrianeboyd@gmail.com>
Date: Mon, 27 Apr 2020 16:51:27 +0200
Subject: [PATCH] bugfix in span similarity (#5155) (#5358)

* bugfix in span similarity

* also rewrite doc.pyx for clarity

* formatting

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 spacy/tests/regression/test_issue5152.py | 18 ++++++++++++++++++
 spacy/tokens/doc.pyx                     | 15 ++++++++-------
 spacy/tokens/span.pyx                    |  6 ++++--
 3 files changed, 30 insertions(+), 9 deletions(-)
 create mode 100644 spacy/tests/regression/test_issue5152.py

diff --git a/spacy/tests/regression/test_issue5152.py b/spacy/tests/regression/test_issue5152.py
new file mode 100644
index 000000000..a9a57746d
--- /dev/null
+++ b/spacy/tests/regression/test_issue5152.py
@@ -0,0 +1,18 @@
+from spacy.lang.en import English
+
+
+def test_issue5152():
+    # Test that comparing a Span with a Token via Span.similarity works correctly.
+    # There was a bug when the number of tokens in the span equaled len(token), i.e. the number of characters in the token (!)
+    nlp = English()
+    text = nlp("Talk about being boring!")
+    text_var = nlp("Talk of being boring!")
+    y = nlp("Let")
+
+    span = text[0:3]  # Talk about being
+    span_2 = text[0:3]  # Talk about being
+    span_3 = text_var[0:3]  # Talk of being
+    token = y[0]  # Let
+    assert span.similarity(token) == 0.0
+    assert span.similarity(span_2) == 1.0
+    assert span_2.similarity(span_3) < 1.0
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index ec0cd66b8..f27115e6f 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -387,13 +387,14 @@ cdef class Doc:
         if isinstance(other, (Lexeme, Token)) and self.length == 1:
             if self.c[0].lex.orth == other.orth:
                 return 1.0
-        elif isinstance(other, (Span, Doc)):
-            if len(self) == len(other):
-                for i in range(self.length):
-                    if self[i].orth != other[i].orth:
-                        break
-                else:
-                    return 1.0
+        elif isinstance(other, (Span, Doc)) and len(self) == len(other):
+            similar = True
+            for i in range(self.length):
+                if self[i].orth != other[i].orth:
+                    similar = False
+                    break
+            if similar:
+                return 1.0
         if self.vocab.vectors.n_keys == 0:
             models_warning(Warnings.W007.format(obj="Doc"))
         if self.vector_norm == 0 or other.vector_norm == 0:
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 35c70f236..9269700b0 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -324,11 +324,13 @@ cdef class Span:
         if len(self) == 1 and hasattr(other, "orth"):
             if self[0].orth == other.orth:
                 return 1.0
-        elif hasattr(other, "__len__") and len(self) == len(other):
+        elif isinstance(other, (Doc, Span)) and len(self) == len(other):
+            similar = True
             for i in range(len(self)):
                 if self[i].orth != getattr(other[i], "orth", None):
+                    similar = False
                     break
-            else:
+            if similar:
                 return 1.0
         if self.vocab.vectors.n_keys == 0:
             models_warning(Warnings.W007.format(obj="Span"))