bugfix in span similarity (#5155)

* bugfix in span similarity

* also rewrite doc.pyx for clarity

* formatting
This commit is contained in:
Sofie Van Landeghem 2020-03-29 13:56:07 +02:00 committed by GitHub
parent 1f9852abc3
commit d6d95674c1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 30 additions and 9 deletions

View File

@ -0,0 +1,18 @@
from spacy.lang.en import English
def test_issue5152():
    """Regression test for issue #5152: comparing a Span with a Token.

    There was a bug when the number of tokens in the span equaled the
    number of characters in the token (here: a 3-token span vs. "Let").
    """
    nlp = English()
    doc = nlp("Talk about being boring!")
    doc_variant = nlp("Talk of being boring!")
    short_doc = nlp("Let")

    first_three = doc[0:3]  # Talk about being
    first_three_again = doc[0:3]  # Talk about being
    variant_three = doc_variant[0:3]  # Talk of being
    single_token = short_doc[0]  # Let

    # Span vs. Token of coincidentally matching "length" must not match.
    assert first_three.similarity(single_token) == 0.0
    # Two spans over the same tokens are identical.
    assert first_three.similarity(first_three_again) == 1.0
    # Spans that differ in one token must not be reported as identical.
    assert first_three_again.similarity(variant_three) < 1.0

View File

@ -380,13 +380,14 @@ cdef class Doc:
if isinstance(other, (Lexeme, Token)) and self.length == 1: if isinstance(other, (Lexeme, Token)) and self.length == 1:
if self.c[0].lex.orth == other.orth: if self.c[0].lex.orth == other.orth:
return 1.0 return 1.0
elif isinstance(other, (Span, Doc)): elif isinstance(other, (Span, Doc)) and len(self) == len(other):
if len(self) == len(other): similar = True
for i in range(self.length): for i in range(self.length):
if self[i].orth != other[i].orth: if self[i].orth != other[i].orth:
break similar = False
else: break
return 1.0 if similar:
return 1.0
if self.vocab.vectors.n_keys == 0: if self.vocab.vectors.n_keys == 0:
warnings.warn(Warnings.W007.format(obj="Doc")) warnings.warn(Warnings.W007.format(obj="Doc"))
if self.vector_norm == 0 or other.vector_norm == 0: if self.vector_norm == 0 or other.vector_norm == 0:

View File

@ -320,11 +320,13 @@ cdef class Span:
if len(self) == 1 and hasattr(other, "orth"): if len(self) == 1 and hasattr(other, "orth"):
if self[0].orth == other.orth: if self[0].orth == other.orth:
return 1.0 return 1.0
elif hasattr(other, "__len__") and len(self) == len(other): elif isinstance(other, (Doc, Span)) and len(self) == len(other):
similar = True
for i in range(len(self)): for i in range(len(self)):
if self[i].orth != getattr(other[i], "orth", None): if self[i].orth != getattr(other[i], "orth", None):
similar = False
break break
else: if similar:
return 1.0 return 1.0
if self.vocab.vectors.n_keys == 0: if self.vocab.vectors.n_keys == 0:
warnings.warn(Warnings.W007.format(obj="Span")) warnings.warn(Warnings.W007.format(obj="Span"))