From c26b4b465067eedf78f11ed772b0a430625669b7 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 14 Sep 2015 08:17:18 +1000
Subject: [PATCH 1/4] * Fix test_base_nps

---
 tests/parser/test_base_nps.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/parser/test_base_nps.py b/tests/parser/test_base_nps.py
index a5cbbd201..8d308bc8d 100644
--- a/tests/parser/test_base_nps.py
+++ b/tests/parser/test_base_nps.py
@@ -31,10 +31,10 @@ def test_pp(EN):
 @pytest.mark.models
 def test_merge_pp(EN):
     sent = EN(u'A phrase with another phrase occurs')
-    nps = [(np[0].idx, np[-1].idx + len(np[-1]), np[0].ent_type_) for np in sent.noun_chunks]
+    nps = [(np[0].idx, np[-1].idx + len(np[-1]), np.lemma_, np[0].ent_type_) for np in sent.noun_chunks]
 
-    for start, end, ent_type in nps:
-        sent.merge(start, end, u'NP', np.lemma_, ent_type)
+    for start, end, lemma, ent_type in nps:
+        sent.merge(start, end, u'NP', lemma, ent_type)
     assert sent[0].string == 'A phrase '
     assert sent[1].string == 'with '
     assert sent[2].string == 'another phrase '
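
Note on the fix above: the old loop read np.lemma_, but at that point np appears to be only the
leaked loop variable from the list comprehension (so every merge would reuse the last chunk's
lemma), and in any case merge() mutates the Doc, so data taken from noun_chunks goes stale once
merging begins. The corrected test captures each chunk's lemma eagerly. A minimal sketch of the
safe pattern, assuming EN is the loaded English pipeline as in the test fixtures:

    # Collect all per-chunk data *before* the first merge; merge() rewrites
    # the Doc, so spans and attributes computed earlier become stale.
    doc = EN(u'A phrase with another phrase occurs')
    nps = [(np[0].idx, np[-1].idx + len(np[-1]), np.lemma_, np[0].ent_type_)
           for np in doc.noun_chunks]
    for start, end, lemma, ent_type in nps:
        doc.merge(start, end, u'NP', lemma, ent_type)
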
From 4ec89788cacfd89b55f516f6d8596bc05c71055c Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 14 Sep 2015 17:48:13 +1000
Subject: [PATCH 2/4] * Add tests for new vectors functionality

---
 tests/vectors/test_vectors.py | 109 ++++++++++++++++++++++++++++++++++
 1 file changed, 109 insertions(+)
 create mode 100644 tests/vectors/test_vectors.py

diff --git a/tests/vectors/test_vectors.py b/tests/vectors/test_vectors.py
new file mode 100644
index 000000000..3c7539e6a
--- /dev/null
+++ b/tests/vectors/test_vectors.py
@@ -0,0 +1,109 @@
+import pytest
+
+@pytest.mark.models
+def test_token_vector(EN):
+    token = EN(u'Apples and oranges')[0]
+    token.vector
+    token.vector_norm
+
+@pytest.mark.models
+def test_lexeme_vector(EN):
+    lexeme = EN.vocab[u'apples']
+    lexeme.vector
+    lexeme.vector_norm
+
+
+@pytest.mark.models
+def test_doc_vector(EN):
+    doc = EN(u'Apples and oranges')
+    doc.vector
+    doc.vector_norm
+
+@pytest.mark.models
+def test_span_vector(EN):
+    span = EN(u'Apples and oranges')[0:2]
+    span.vector
+    span.vector_norm
+
+@pytest.mark.models
+def test_token_token_similarity(EN):
+    apples, oranges = EN(u'apples oranges')
+    assert apples.similarity(oranges) == oranges.similarity(apples)
+    assert 0.0 < apples.similarity(oranges) < 1.0
+
+
+@pytest.mark.models
+def test_token_lexeme_similarity(EN):
+    apples = EN(u'apples')
+    oranges = EN.vocab[u'oranges']
+    assert apples.similarity(oranges) == oranges.similarity(apples)
+    assert 0.0 < apples.similarity(oranges) < 1.0
+
+
+@pytest.mark.models
+def test_token_span_similarity(EN):
+    doc = EN(u'apples orange juice')
+    apples = doc[0]
+    oranges = doc[1:3]
+    assert apples.similarity(oranges) == oranges.similarity(apples)
+    assert 0.0 < apples.similarity(oranges) < 1.0
+
+
+@pytest.mark.models
+def test_token_doc_similarity(EN):
+    doc = EN(u'apples orange juice')
+    apples = doc[0]
+    assert apples.similarity(doc) == doc.similarity(apples)
+    assert 0.0 < apples.similarity(doc) < 1.0
+
+
+@pytest.mark.models
+def test_lexeme_span_similarity(EN):
+    doc = EN(u'apples orange juice')
+    apples = EN.vocab[u'apples']
+    span = doc[1:3]
+    assert apples.similarity(span) == span.similarity(apples)
+    assert 0.0 < apples.similarity(span) < 1.0
+
+
+@pytest.mark.models
+def test_lexeme_lexeme_similarity(EN):
+    apples = EN.vocab[u'apples']
+    oranges = EN.vocab[u'oranges']
+    assert apples.similarity(oranges) == oranges.similarity(apples)
+    assert 0.0 < apples.similarity(oranges) < 1.0
+
+
+@pytest.mark.models
+def test_lexeme_doc_similarity(EN):
+    doc = EN(u'apples orange juice')
+    apples = EN.vocab[u'apples']
+    assert apples.similarity(doc) == doc.similarity(apples)
+    assert 0.0 < apples.similarity(doc) < 1.0
+
+
+@pytest.mark.models
+def test_span_span_similarity(EN):
+    doc = EN(u'apples orange juice')
+    apples = doc[0:2]
+    oj = doc[1:3]
+    assert apples.similarity(oj) == oj.similarity(apples)
+    assert 0.0 < apples.similarity(oj) < 1.0
+
+
+@pytest.mark.models
+def test_span_doc_similarity(EN):
+    doc = EN(u'apples orange juice')
+    apples = doc[0:2]
+    oj = doc[1:3]
+    assert apples.similarity(doc) == doc.similarity(apples)
+    assert 0.0 < apples.similarity(doc) < 1.0
+
+
+@pytest.mark.models
+def test_doc_doc_similarity(EN):
+    apples = EN(u'apples and apple pie')
+    oranges = EN(u'orange juice')
+    assert apples.similarity(oranges) == oranges.similarity(apples)
+    assert 0.0 < apples.similarity(oranges) < 1.0
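
All of the symmetry assertions above follow from what similarity() is in PATCH 4/4: cosine
similarity, which is symmetric in its arguments. A minimal self-contained sketch of the same
check in plain numpy, with made-up vectors standing in for the model's:

    import numpy
    import numpy.linalg

    def cosine(a, b):
        # Same formula the series adds as .similarity(): the dot product
        # divided by the product of the two L2 norms.
        return numpy.dot(a, b) / (numpy.linalg.norm(a) * numpy.linalg.norm(b))

    apples = numpy.array([0.8, 0.1, 0.3])
    oranges = numpy.array([0.7, 0.2, 0.4])
    assert cosine(apples, oranges) == cosine(oranges, apples)  # symmetric
    assert 0.0 < cosine(apples, oranges) < 1.0                 # related, not identical
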
From e13e47e9e572a47ae5dca455cdb18c0811979839 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 14 Sep 2015 17:48:51 +1000
Subject: [PATCH 3/4] * Add English stop words

---
 spacy/en/__init__.py | 32 ++++++++++++++++++++++++++++++++
 spacy/language.py    |  6 +++++-
 2 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py
index f68ff196e..4d057db20 100644
--- a/spacy/en/__init__.py
+++ b/spacy/en/__init__.py
@@ -6,7 +6,39 @@ from ..language import Language
 
 LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data')
 
+
+# improved list from Stone, Denis, Kwantes (2010)
+STOPWORDS = """
+a about above across after afterwards again against all almost alone along already also although always am among amongst amoungst amount an and another any anyhow anyone anything anyway anywhere are around as at back be
+became because become becomes becoming been before beforehand behind being below beside besides between beyond bill both bottom but by call can
+cannot cant co computer con could couldnt cry de describe
+detail did didn do does doesn doing don done down due during
+each eg eight either eleven else elsewhere empty enough etc even ever every everyone everything everywhere except few fifteen
+fifty fill find fire first five for former formerly forty found four from front full further get give go
+had has hasnt have he hence her here hereafter hereby herein hereupon hers herself him himself his how however hundred i ie
+if in inc indeed interest into is it its itself keep last latter latterly least less ltd
+just
+kg km
+made make many may me meanwhile might mill mine more moreover most mostly move much must my myself name namely
+neither never nevertheless next nine no nobody none noone nor not nothing now nowhere of off
+often on once one only onto or other others otherwise our ours ourselves out over own part per
+perhaps please put rather re
+quite
+really regarding
+same say see seem seemed seeming seems serious several she should show side since sincere six sixty so some somehow someone something sometime sometimes somewhere still such system take ten
+than that the their them themselves then thence there thereafter thereby therefore therein thereupon these they thick thin third this those though three through throughout thru thus to together too top toward towards twelve twenty two un under
+until up unless upon us used using
+various very via
+was we well were what whatever when whence whenever where whereafter whereas whereby wherein whereupon wherever whether which while whither who whoever whole whom whose why will with within without would yet you
+your yours yourself yourselves
+"""
+STOPWORDS = set(w for w in STOPWORDS.split() if w)
+
 class English(Language):
     @classmethod
     def default_data_dir(cls):
         return LOCAL_DATA_DIR
+
+    @staticmethod
+    def is_stop(string):
+        return 1 if string.lower() in STOPWORDS else 0

diff --git a/spacy/language.py b/spacy/language.py
index f32756a4d..c3a938458 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -94,6 +94,10 @@ class Language(object):
     def like_email(string):
         return orth.like_email(string)
 
+    @staticmethod
+    def is_stop(string):
+        return 0
+
     @classmethod
     def default_lex_attrs(cls, data_dir=None):
         return {
@@ -116,7 +120,7 @@ class Language(object):
             attrs.LIKE_URL: cls.like_url,
             attrs.LIKE_NUM: cls.like_number,
             attrs.LIKE_EMAIL: cls.like_email,
-            attrs.IS_STOP: lambda string: False,
+            attrs.IS_STOP: cls.is_stop,
             attrs.IS_OOV: lambda string: True
         }
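
The two halves of this patch form a simple override hook: Language.is_stop() is the do-nothing
default, wired into default_lex_attrs() as attrs.IS_STOP, and English overrides it with a set
lookup, so every lexeme gets the flag when it is constructed. A minimal sketch of that pattern
outside spaCy, with a toy word list in place of the real one:

    STOPWORDS = set(u'a an the of and'.split())  # toy subset for illustration

    class Language(object):
        @staticmethod
        def is_stop(string):
            return 0  # base default: no stop words without a language-specific list

    class English(Language):
        @staticmethod
        def is_stop(string):
            return 1 if string.lower() in STOPWORDS else 0

    assert Language.is_stop(u'The') == 0
    assert English.is_stop(u'The') == 1  # input is lower-cased before the lookup
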
From 65dc0d1dfb4ed6634de61242ecb40343778bfe48 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 14 Sep 2015 17:49:58 +1000
Subject: [PATCH 4/4] * Extend word vectors support, with .similarity()
 function, vector_norm property, and rename repvec to vector. Keep repvec
 name as well for now for backwards compatibility.

---
 spacy/lexeme.pyx       | 27 +++++++++++++++++++++++++++
 spacy/tokens/doc.pyx   | 18 ++++++++++++++++++
 spacy/tokens/spans.pyx | 14 ++++++++++++++
 spacy/tokens/token.pyx | 16 +++++++++++++++-
 4 files changed, 74 insertions(+), 1 deletion(-)

diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index e0fa854cb..44c31f834 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -3,6 +3,13 @@ from cpython.ref cimport Py_INCREF
 from cymem.cymem cimport Pool
 from murmurhash.mrmr cimport hash64
 
+# Compiler crashes on memory view coercion without this. Should report bug.
+from cython.view cimport array as cvarray
+cimport numpy as np
+np.import_array()
+
+
+
 from libc.string cimport memset
 
 from .orth cimport word_shape
@@ -35,6 +42,26 @@ cdef class Lexeme:
     def py_check_flag(self, attr_id_t flag_id):
         return True if Lexeme.check_flag(self.c, flag_id) else False
 
+    def similarity(self, other):
+        return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
+
+    property vector_norm:
+        def __get__(self):
+            return self.c.l2_norm
+
+        def __set__(self, float value):
+            self.c.l2_norm = value
+
+    property vector:
+        def __get__(self):
+            cdef int length = self.vocab.repvec_length
+            repvec_view = <float[:length,]>self.c.repvec
+            return numpy.asarray(repvec_view)
+
+    property repvec:
+        def __get__(self):
+            return self.vector
+
     property orth_:
         def __get__(self):
             return self.vocab.strings[self.c.orth]
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index f9552b6eb..6878793ab 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -3,7 +3,9 @@
 from libc.string cimport memcpy, memset
 from libc.stdint cimport uint32_t
 import numpy
+import numpy.linalg
 import struct
+cimport numpy as np
 
 from ..lexeme cimport Lexeme
 from ..lexeme cimport EMPTY_LEXEME
@@ -118,6 +120,22 @@ cdef class Doc:
     def __str__(self):
         return u''.join([t.string for t in self])
 
+    def similarity(self, other):
+        return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
+
+    property repvec:
+        def __get__(self):
+            return self.vector
+
+    property vector:
+        def __get__(self):
+            return sum(t.vector for t in self if not t.is_stop) / len(self)
+
+
+    property vector_norm:
+        def __get__(self):
+            return numpy.linalg.norm(self.vector)
+
     @property
     def string(self):
         return u''.join([t.string for t in self])
diff --git a/spacy/tokens/spans.pyx b/spacy/tokens/spans.pyx
index d9e4fbf0e..12ad6e425 100644
--- a/spacy/tokens/spans.pyx
+++ b/spacy/tokens/spans.pyx
@@ -1,5 +1,8 @@
 from __future__ import unicode_literals
 from collections import defaultdict
+import numpy
+import numpy.linalg
+cimport numpy as np
 
 from ..structs cimport TokenC, LexemeC
 from ..typedefs cimport flags_t, attr_t
@@ -52,6 +55,17 @@ cdef class Span:
     def merge(self, unicode tag, unicode lemma, unicode ent_type):
         self._seq.merge(self[0].idx, self[-1].idx + len(self[-1]), tag, lemma, ent_type)
 
+    def similarity(self, other):
+        return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
+
+    property vector:
+        def __get__(self):
+            return sum(t.vector for t in self if not t.is_stop) / len(self)
+
+    property vector_norm:
+        def __get__(self):
+            return numpy.linalg.norm(self.vector)
+
     property text:
         def __get__(self):
             return u' '.join([t.text for t in self])
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index e3e78838f..5b5d84887 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -49,6 +49,9 @@ cdef class Token:
     def nbor(self, int i=1):
         return self.doc[self.i+i]
 
+    def similarity(self, other):
+        return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
+
     property lex_id:
         def __get__(self):
             return self.c.lex.id
@@ -125,12 +128,20 @@ cdef class Token:
         def __get__(self):
             return self.c.dep
 
-    property repvec:
+    property vector:
         def __get__(self):
             cdef int length = self.vocab.repvec_length
             repvec_view = <float[:length,]>self.c.lex.repvec
             return numpy.asarray(repvec_view)
 
+    property repvec:
+        def __get__(self):
+            return self.vector
+
+    property vector_norm:
+        def __get__(self):
+            return self.c.lex.l2_norm
+
     property n_lefts:
        def __get__(self):
             cdef int n = 0
@@ -302,6 +313,9 @@ cdef class Token:
     property is_oov:
         def __get__(self): return Lexeme.check_flag(self.c.lex, IS_OOV)
 
+    property is_stop:
+        def __get__(self): return Lexeme.check_flag(self.c.lex, IS_STOP)
+
     property is_alpha:
         def __get__(self): return Lexeme.check_flag(self.c.lex, IS_ALPHA)
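
One behaviour worth noting in the Doc and Span vector properties above: the sum skips stop
words, but the divisor is the full token count, not the count of non-stop tokens, so text that
is mostly stop words yields a smaller-norm vector. A minimal numpy sketch of that computation,
with made-up three-dimensional vectors standing in for the model's:

    import numpy
    import numpy.linalg

    vectors = {
        u'apples':  numpy.array([0.8, 0.1, 0.3]),
        u'and':     numpy.array([0.1, 0.9, 0.2]),
        u'oranges': numpy.array([0.7, 0.2, 0.4]),
    }
    stops = {u'and'}  # toy stop list
    tokens = [u'apples', u'and', u'oranges']

    # Mirrors: sum(t.vector for t in self if not t.is_stop) / len(self)
    doc_vector = sum(vectors[t] for t in tokens if t not in stops) / len(tokens)
    doc_norm = numpy.linalg.norm(doc_vector)  # what Doc.vector_norm recomputes
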