diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 8c1b5f34c..d136540f9 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -112,6 +112,14 @@ cdef class Lexeme:
             `Span`, `Token` and `Lexeme` objects.
         RETURNS (float): A scalar similarity score. Higher is more similar.
         """
+        # Return 1.0 similarity for exact matches
+        if hasattr(other, 'orth'):
+            if self.c.orth == other.orth:
+                return 1.0
+        elif hasattr(other, '__len__') and len(other) == 1 \
+        and hasattr(other[0], 'orth'):
+            if self.c.orth == other[0].orth:
+                return 1.0
         if self.vector_norm == 0 or other.vector_norm == 0:
             return 0.0
         return (numpy.dot(self.vector, other.vector) /
diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index e4d57cbb0..a5f50f2bc 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -217,6 +217,16 @@ def test_doc_api_has_vector():
     doc = Doc(vocab, words=['kitten'])
     assert doc.has_vector
 
+
+def test_doc_api_similarity_match():
+    doc = Doc(Vocab(), words=['a'])
+    assert doc.similarity(doc[0]) == 1.0
+    assert doc.similarity(doc.vocab['a']) == 1.0
+    doc2 = Doc(doc.vocab, words=['a', 'b', 'c'])
+    assert doc.similarity(doc2[:1]) == 1.0
+    assert doc.similarity(doc2) == 0.0
+
+
 def test_lowest_common_ancestor(en_tokenizer):
     tokens = en_tokenizer('the lazy dog slept')
     doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=[2, 1, 1, 0])
@@ -225,6 +235,7 @@ def test_lowest_common_ancestor(en_tokenizer):
     assert(lca[0, 1] == 2)
     assert(lca[1, 2] == 2)
 
+
 def test_parse_tree(en_tokenizer):
     """Tests doc.print_tree() method."""
     text = 'I like New York in Autumn.'
diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index b8638ba4b..8cd4347c2 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -3,6 +3,8 @@ from __future__ import unicode_literals
 
 from ..util import get_doc
 from ...attrs import ORTH, LENGTH
+from ...tokens import Doc
+from ...vocab import Vocab
 
 import pytest
 
@@ -66,6 +68,15 @@ def test_spans_lca_matrix(en_tokenizer):
     assert(lca[1, 1] == 1)
 
 
+def test_span_similarity_match():
+    doc = Doc(Vocab(), words=['a', 'b', 'a', 'b'])
+    span1 = doc[:2]
+    span2 = doc[2:]
+    assert span1.similarity(span2) == 1.0
+    assert span1.similarity(doc) == 0.0
+    assert span1[:1].similarity(doc.vocab['a']) == 1.0
+
+
 def test_spans_default_sentiment(en_tokenizer):
     """Test span.sentiment property's default averaging behaviour"""
     text = "good stuff bad stuff"
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index df6a17521..12932238f 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -295,6 +295,17 @@ cdef class Doc:
         """
         if 'similarity' in self.user_hooks:
             return self.user_hooks['similarity'](self, other)
+        if isinstance(other, (Lexeme, Token)) and self.length == 1:
+            if self.c[0].lex.orth == other.orth:
+                return 1.0
+        elif isinstance(other, (Span, Doc)):
+            if len(self) == len(other):
+                for i in range(self.length):
+                    if self[i].orth != other[i].orth:
+                        break
+                else:
+                    return 1.0
+
         if self.vector_norm == 0 or other.vector_norm == 0:
             return 0.0
         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 7c8e0a9ed..10d9660e7 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -184,6 +184,15 @@ cdef class Span:
         """
         if 'similarity' in self.doc.user_span_hooks:
-            self.doc.user_span_hooks['similarity'](self, other)
+            return self.doc.user_span_hooks['similarity'](self, other)
+        if len(self) == 1 and hasattr(other, 'orth'):
+            if self[0].orth == other.orth:
+                return 1.0
+        elif hasattr(other, '__len__') and len(self) == len(other):
+            for i in range(len(self)):
+                if self[i].orth != getattr(other[i], 'orth', None):
+                    break
+            else:
+                return 1.0
         if self.vector_norm == 0.0 or other.vector_norm == 0.0:
             return 0.0
         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 11a165ab8..74487b515 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -149,6 +149,12 @@ cdef class Token:
         """
         if 'similarity' in self.doc.user_token_hooks:
-            return self.doc.user_token_hooks['similarity'](self)
+            return self.doc.user_token_hooks['similarity'](self, other)
+        if hasattr(other, '__len__') and len(other) == 1:
+            if self.c.lex.orth == getattr(other[0], 'orth', None):
+                return 1.0
+        elif hasattr(other, 'orth'):
+            if self.c.lex.orth == other.orth:
+                return 1.0
         if self.vector_norm == 0 or other.vector_norm == 0:
             return 0.0
         return (numpy.dot(self.vector, other.vector) /
diff --git a/website/usage/_spacy-101/_serialization.jade b/website/usage/_spacy-101/_serialization.jade
index 27804344e..eba2d573e 100644
--- a/website/usage/_spacy-101/_serialization.jade
+++ b/website/usage/_spacy-101/_serialization.jade
@@ -48,9 +48,9 @@ p
     | those IDs back to strings.
 
 +code.
-    moby_dick = open('moby_dick.txt', 'r') # open a large document
-    doc = nlp(moby_dick) # process it
-    doc.to_disk('/moby_dick.bin') # save the processed Doc
+    text = open('customer_feedback_627.txt', 'r').read() # open a document
+    doc = nlp(text) # process it
+    doc.to_disk('/customer_feedback_627.bin') # save the processed Doc
 
 p
     | If you need it again later, you can load it back into an empty #[code Doc]
@@ -61,4 +61,4 @@ p
     from spacy.tokens import Doc # to create empty Doc
     from spacy.vocab import Vocab # to create empty Vocab
 
-    doc = Doc(Vocab()).from_disk('/moby_dick.bin') # load processed Doc
+    doc = Doc(Vocab()).from_disk('/customer_feedback_627.bin') # load processed Doc
diff --git a/website/usage/resources.jade b/website/usage/resources.jade
index 536a92cf8..8766d3864 100644
--- a/website/usage/resources.jade
+++ b/website/usage/resources.jade
@@ -37,6 +37,9 @@ include ../_includes/_mixins
     +card("spacy-api-docker", "https://github.com/jgontrum/spacy-api-docker", "Johannes Gontrum", "github")
         | spaCy accessed by a REST API, wrapped in a Docker container.
 
+    +card("languagecrunch", "https://github.com/artpar/languagecrunch", "Parth Mudgal", "github")
+        | NLP server for spaCy, WordNet and NeuralCoref as a Docker image.
+
     +card("spacy-nlp-zeromq", "https://github.com/pasupulaphani/spacy-nlp-docker", "Phaninder Pasupula", "github")
         | Docker image exposing spaCy with ZeroMQ bindings.
 
@@ -69,6 +72,10 @@ include ../_includes/_mixins
         | Add language detection to your spaCy pipeline using Compact
         | Language Detector 2 via PYCLD2.
 
+    +card("spacy-lookup", "https://github.com/mpuig/spacy-lookup", "Marc Puig", "github")
+        | A powerful entity matcher for very large dictionaries, using the
+        | FlashText module.
+
     .u-text-right
         +button("https://github.com/topics/spacy-extension?o=desc&s=stars", false, "primary", "small") See more extensions on GitHub
 
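
Note: the patch above short-circuits `similarity()` with an exact-match check on the `orth` attribute before falling back to cosine similarity over vectors, so identical strings now score 1.0 even when no vectors are loaded (previously they scored 0.0 because `vector_norm` is 0 for out-of-vocabulary words). A minimal usage sketch, mirroring the new tests in this diff; it assumes a spaCy build with this patch applied and uses a blank `Vocab`, so no vectors are present:

    # Mirrors test_span_similarity_match: exact orth matches score 1.0
    # even though the blank Vocab has no vectors loaded.
    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    doc = Doc(Vocab(), words=['a', 'b', 'a', 'b'])

    assert doc[:2].similarity(doc[2:]) == 1.0        # span vs. identical span
    assert doc[0].similarity(doc.vocab['a']) == 1.0  # token vs. matching lexeme
    assert doc[:2].similarity(doc) == 0.0            # no match, no vectors -> 0.0

Because the shortcut runs before the vector lookup, comparing an out-of-vocabulary word against itself no longer returns 0.0; any registered `similarity` user hook still takes precedence over both paths.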