mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 18:56:36 +03:00
Merge branch 'master' of https://github.com/explosion/spaCy
This commit is contained in:
commit
4a7d524efb
|
@ -112,6 +112,14 @@ cdef class Lexeme:
|
||||||
`Span`, `Token` and `Lexeme` objects.
|
`Span`, `Token` and `Lexeme` objects.
|
||||||
RETURNS (float): A scalar similarity score. Higher is more similar.
|
RETURNS (float): A scalar similarity score. Higher is more similar.
|
||||||
"""
|
"""
|
||||||
|
# Return 1.0 similarity for matches
|
||||||
|
if hasattr(other, 'orth'):
|
||||||
|
if self.c.orth == other.orth:
|
||||||
|
return 1.0
|
||||||
|
elif hasattr(other, '__len__') and len(other) == 1 \
|
||||||
|
and hasattr(other[0], 'orth'):
|
||||||
|
if self.c.orth == other[0].orth:
|
||||||
|
return 1.0
|
||||||
if self.vector_norm == 0 or other.vector_norm == 0:
|
if self.vector_norm == 0 or other.vector_norm == 0:
|
||||||
return 0.0
|
return 0.0
|
||||||
return (numpy.dot(self.vector, other.vector) /
|
return (numpy.dot(self.vector, other.vector) /
|
||||||
|
|
|
@ -217,6 +217,16 @@ def test_doc_api_has_vector():
|
||||||
doc = Doc(vocab, words=['kitten'])
|
doc = Doc(vocab, words=['kitten'])
|
||||||
assert doc.has_vector
|
assert doc.has_vector
|
||||||
|
|
||||||
|
|
||||||
|
def test_doc_api_similarity_match():
|
||||||
|
doc = Doc(Vocab(), words=['a'])
|
||||||
|
assert doc.similarity(doc[0]) == 1.0
|
||||||
|
assert doc.similarity(doc.vocab['a']) == 1.0
|
||||||
|
doc2 = Doc(doc.vocab, words=['a', 'b', 'c'])
|
||||||
|
assert doc.similarity(doc2[:1]) == 1.0
|
||||||
|
assert doc.similarity(doc2) == 0.0
|
||||||
|
|
||||||
|
|
||||||
def test_lowest_common_ancestor(en_tokenizer):
|
def test_lowest_common_ancestor(en_tokenizer):
|
||||||
tokens = en_tokenizer('the lazy dog slept')
|
tokens = en_tokenizer('the lazy dog slept')
|
||||||
doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=[2, 1, 1, 0])
|
doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=[2, 1, 1, 0])
|
||||||
|
@ -225,6 +235,7 @@ def test_lowest_common_ancestor(en_tokenizer):
|
||||||
assert(lca[0, 1] == 2)
|
assert(lca[0, 1] == 2)
|
||||||
assert(lca[1, 2] == 2)
|
assert(lca[1, 2] == 2)
|
||||||
|
|
||||||
|
|
||||||
def test_parse_tree(en_tokenizer):
|
def test_parse_tree(en_tokenizer):
|
||||||
"""Tests doc.print_tree() method."""
|
"""Tests doc.print_tree() method."""
|
||||||
text = 'I like New York in Autumn.'
|
text = 'I like New York in Autumn.'
|
||||||
|
|
|
@ -3,6 +3,8 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
from ..util import get_doc
|
from ..util import get_doc
|
||||||
from ...attrs import ORTH, LENGTH
|
from ...attrs import ORTH, LENGTH
|
||||||
|
from ...tokens import Doc
|
||||||
|
from ...vocab import Vocab
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
@ -66,6 +68,15 @@ def test_spans_lca_matrix(en_tokenizer):
|
||||||
assert(lca[1, 1] == 1)
|
assert(lca[1, 1] == 1)
|
||||||
|
|
||||||
|
|
||||||
|
def test_span_similarity_match():
|
||||||
|
doc = Doc(Vocab(), words=['a', 'b', 'a', 'b'])
|
||||||
|
span1 = doc[:2]
|
||||||
|
span2 = doc[2:]
|
||||||
|
assert span1.similarity(span2) == 1.0
|
||||||
|
assert span1.similarity(doc) == 0.0
|
||||||
|
assert span1[:1].similarity(doc.vocab['a']) == 1.0
|
||||||
|
|
||||||
|
|
||||||
def test_spans_default_sentiment(en_tokenizer):
|
def test_spans_default_sentiment(en_tokenizer):
|
||||||
"""Test span.sentiment property's default averaging behaviour"""
|
"""Test span.sentiment property's default averaging behaviour"""
|
||||||
text = "good stuff bad stuff"
|
text = "good stuff bad stuff"
|
||||||
|
|
|
@ -295,6 +295,17 @@ cdef class Doc:
|
||||||
"""
|
"""
|
||||||
if 'similarity' in self.user_hooks:
|
if 'similarity' in self.user_hooks:
|
||||||
return self.user_hooks['similarity'](self, other)
|
return self.user_hooks['similarity'](self, other)
|
||||||
|
if isinstance(other, (Lexeme, Token)) and self.length == 1:
|
||||||
|
if self.c[0].lex.orth == other.orth:
|
||||||
|
return 1.0
|
||||||
|
elif isinstance(other, (Span, Doc)):
|
||||||
|
if len(self) == len(other):
|
||||||
|
for i in range(self.length):
|
||||||
|
if self[i].orth != other[i].orth:
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
return 1.0
|
||||||
|
|
||||||
if self.vector_norm == 0 or other.vector_norm == 0:
|
if self.vector_norm == 0 or other.vector_norm == 0:
|
||||||
return 0.0
|
return 0.0
|
||||||
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
||||||
|
|
|
@ -184,6 +184,15 @@ cdef class Span:
|
||||||
"""
|
"""
|
||||||
if 'similarity' in self.doc.user_span_hooks:
|
if 'similarity' in self.doc.user_span_hooks:
|
||||||
self.doc.user_span_hooks['similarity'](self, other)
|
self.doc.user_span_hooks['similarity'](self, other)
|
||||||
|
if len(self) == 1 and hasattr(other, 'orth'):
|
||||||
|
if self[0].orth == other.orth:
|
||||||
|
return 1.0
|
||||||
|
elif hasattr(other, '__len__') and len(self) == len(other):
|
||||||
|
for i in range(len(self)):
|
||||||
|
if self[i].orth != getattr(other[i], 'orth', None):
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
return 1.0
|
||||||
if self.vector_norm == 0.0 or other.vector_norm == 0.0:
|
if self.vector_norm == 0.0 or other.vector_norm == 0.0:
|
||||||
return 0.0
|
return 0.0
|
||||||
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
||||||
|
|
|
@ -149,6 +149,12 @@ cdef class Token:
|
||||||
"""
|
"""
|
||||||
if 'similarity' in self.doc.user_token_hooks:
|
if 'similarity' in self.doc.user_token_hooks:
|
||||||
return self.doc.user_token_hooks['similarity'](self)
|
return self.doc.user_token_hooks['similarity'](self)
|
||||||
|
if hasattr(other, '__len__') and len(other) == 1:
|
||||||
|
if self.c.lex.orth == getattr(other[0], 'orth', None):
|
||||||
|
return 1.0
|
||||||
|
elif hasattr(other, 'orth'):
|
||||||
|
if self.c.lex.orth == other.orth:
|
||||||
|
return 1.0
|
||||||
if self.vector_norm == 0 or other.vector_norm == 0:
|
if self.vector_norm == 0 or other.vector_norm == 0:
|
||||||
return 0.0
|
return 0.0
|
||||||
return (numpy.dot(self.vector, other.vector) /
|
return (numpy.dot(self.vector, other.vector) /
|
||||||
|
|
|
@ -48,9 +48,9 @@ p
|
||||||
| those IDs back to strings.
|
| those IDs back to strings.
|
||||||
|
|
||||||
+code.
|
+code.
|
||||||
moby_dick = open('moby_dick.txt', 'r') # open a large document
|
text = open('customer_feedback_627.txt', 'r').read() # open a document
|
||||||
doc = nlp(moby_dick) # process it
|
doc = nlp(text) # process it
|
||||||
doc.to_disk('/moby_dick.bin') # save the processed Doc
|
doc.to_disk('/customer_feedback_627.bin') # save the processed Doc
|
||||||
|
|
||||||
p
|
p
|
||||||
| If you need it again later, you can load it back into an empty #[code Doc]
|
| If you need it again later, you can load it back into an empty #[code Doc]
|
||||||
|
@ -61,4 +61,4 @@ p
|
||||||
from spacy.tokens import Doc # to create empty Doc
|
from spacy.tokens import Doc # to create empty Doc
|
||||||
from spacy.vocab import Vocab # to create empty Vocab
|
from spacy.vocab import Vocab # to create empty Vocab
|
||||||
|
|
||||||
doc = Doc(Vocab()).from_disk('/moby_dick.bin') # load processed Doc
|
doc = Doc(Vocab()).from_disk('/customer_feedback_627.bin') # load processed Doc
|
||||||
|
|
|
@ -37,6 +37,9 @@ include ../_includes/_mixins
|
||||||
+card("spacy-api-docker", "https://github.com/jgontrum/spacy-api-docker", "Johannes Gontrum", "github")
|
+card("spacy-api-docker", "https://github.com/jgontrum/spacy-api-docker", "Johannes Gontrum", "github")
|
||||||
| spaCy accessed by a REST API, wrapped in a Docker container.
|
| spaCy accessed by a REST API, wrapped in a Docker container.
|
||||||
|
|
||||||
|
+card("languagecrunch", "https://github.com/artpar/languagecrunch", "Parth Mudgal", "github")
|
||||||
|
| NLP server for spaCy, WordNet and NeuralCoref as a Docker image.
|
||||||
|
|
||||||
+card("spacy-nlp-zeromq", "https://github.com/pasupulaphani/spacy-nlp-docker", "Phaninder Pasupula", "github")
|
+card("spacy-nlp-zeromq", "https://github.com/pasupulaphani/spacy-nlp-docker", "Phaninder Pasupula", "github")
|
||||||
| Docker image exposing spaCy with ZeroMQ bindings.
|
| Docker image exposing spaCy with ZeroMQ bindings.
|
||||||
|
|
||||||
|
@ -69,6 +72,10 @@ include ../_includes/_mixins
|
||||||
| Add language detection to your spaCy pipeline using Compact
|
| Add language detection to your spaCy pipeline using Compact
|
||||||
| Language Detector 2 via PYCLD2.
|
| Language Detector 2 via PYCLD2.
|
||||||
|
|
||||||
|
+card("spacy-lookup", "https://github.com/mpuig/spacy-lookup", "Marc Puig", "github")
|
||||||
|
| A powerful entity matcher for very large dictionaries, using the
|
||||||
|
| FlashText module.
|
||||||
|
|
||||||
.u-text-right
|
.u-text-right
|
||||||
+button("https://github.com/topics/spacy-extension?o=desc&s=stars", false, "primary", "small") See more extensions on GitHub
|
+button("https://github.com/topics/spacy-extension?o=desc&s=stars", false, "primary", "small") See more extensions on GitHub
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user