From 0bc48e82d0adfd9e42c463b65f3ee1929f4172f5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 15 Oct 2015 02:44:23 +1100 Subject: [PATCH 1/7] * Add _handler to resolve Issue #123 --- examples/_handler.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 examples/_handler.py diff --git a/examples/_handler.py b/examples/_handler.py new file mode 100644 index 000000000..cebfe8968 --- /dev/null +++ b/examples/_handler.py @@ -0,0 +1,37 @@ +# encoding: utf8 +from __future__ import unicode_literals, print_function + +from math import sqrt +from numpy import dot +from numpy.linalg import norm + + +def handle_tweet(spacy, tweet_data, query): + text = tweet_data.get('text', u'') + # Twython returns either bytes or unicode, depending on tweet. + # ಠ_ಠ #APIshaming + try: + match_tweet(spacy, text, query) + except TypeError: + match_tweet(spacy, text.decode('utf8'), query) + + +def match_tweet(spacy, text, query): + def get_vector(word): + return spacy.vocab[word].repvec + + tweet = spacy(text) + tweet = [w.repvec for w in tweet if w.is_alpha and w.lower_ != query] + if tweet: + accept = map(get_vector, 'child classroom teach'.split()) + reject = map(get_vector, 'mouth hands giveaway'.split()) + + y = sum(max(cos(w1, w2), 0) for w1 in tweet for w2 in accept) + n = sum(max(cos(w1, w2), 0) for w1 in tweet for w2 in reject) + + if (y / (y + n)) >= 0.5 or True: + print(text) + + +def cos(v1, v2): + return dot(v1, v2) / (norm(v1) * norm(v2)) From b0c6daf356c58e9ed373542ab2de30082a5e2237 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 15 Oct 2015 03:20:51 +1100 Subject: [PATCH 2/7] * Fix test_space_attachment --- tests/parser/test_space_attachment.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/parser/test_space_attachment.py b/tests/parser/test_space_attachment.py index 73cf22cea..2a5636b95 100644 --- a/tests/parser/test_space_attachment.py +++ b/tests/parser/test_space_attachment.py @@ -7,6 +7,5 @@ def test_space_attachment(EN): sentence = 'This is a test.\nTo ensure spaces are attached well.' doc = EN(sentence) - for word in doc: - if word.is_space: - assert word.head.i == (word.i - 1) + for sent in doc.sents: + assert not sent[-1].is_space From b4cac52f7f5cfe545f8cf2f4d4c72ed791ba52ef Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 15 Oct 2015 03:24:57 +1100 Subject: [PATCH 3/7] * Fix test_space_attachment --- tests/parser/test_space_attachment.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/parser/test_space_attachment.py b/tests/parser/test_space_attachment.py index 2a5636b95..114d324fd 100644 --- a/tests/parser/test_space_attachment.py +++ b/tests/parser/test_space_attachment.py @@ -8,4 +8,5 @@ def test_space_attachment(EN): doc = EN(sentence) for sent in doc.sents: - assert not sent[-1].is_space + if len(sent) == 1: + assert not sent[-1].is_space From 23818f89b83d829d1b6b355a66fd5bc4b0477a20 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 15 Oct 2015 03:34:57 +1100 Subject: [PATCH 4/7] * Fix token.conjuncts method --- spacy/tokens/token.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index af80b5359..140d07251 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -248,12 +248,12 @@ cdef class Token: property conjuncts: def __get__(self): - """Get a list of conjoined words""" + """Get a list of conjoined words.""" cdef Token word conjs = [] if self.c.pos != CONJ and self.c.pos != PUNCT: seen_conj = False - for word in reversed(list(self.lefts)): + for word in self.rights: if word.c.pos == CONJ: seen_conj = True elif seen_conj and word.c.pos == self.c.pos: From b8f3345a82d9626c77bc23e25973b306a4305715 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 15 Oct 2015 03:36:01 +1100 Subject: [PATCH 5/7] * Fix token.conjuncts method --- spacy/tokens/token.pyx | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 140d07251..bb2bbdf89 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -250,7 +250,7 @@ cdef class Token: def __get__(self): """Get a list of conjoined words.""" cdef Token word - conjs = [] + conjs = [self] if self.c.pos != CONJ and self.c.pos != PUNCT: seen_conj = False for word in self.rights: @@ -258,8 +258,6 @@ cdef class Token: seen_conj = True elif seen_conj and word.c.pos == self.c.pos: conjs.append(word) - conjs.reverse() - conjs.append(self) if seen_conj: return conjs elif self is not self.head and self in self.head.conjuncts: From 2e0104ac81a2ccbea745ed19b8dc053a9ef2ff27 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 15 Oct 2015 03:47:45 +1100 Subject: [PATCH 6/7] * Fix token.conjuncts --- spacy/tokens/token.pyx | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index bb2bbdf89..1f672ee35 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -250,20 +250,14 @@ cdef class Token: def __get__(self): """Get a list of conjoined words.""" cdef Token word - conjs = [self] + conjuncts = [] if self.c.pos != CONJ and self.c.pos != PUNCT: - seen_conj = False for word in self.rights: - if word.c.pos == CONJ: - seen_conj = True - elif seen_conj and word.c.pos == self.c.pos: - conjs.append(word) - if seen_conj: - return conjs - elif self is not self.head and self in self.head.conjuncts: - return self.head.conjuncts - else: - return [] + if word.dep_ == 'conj': + yield word + yield from word.conjuncts + conjuncts.append(word) + conjuncts.extend(word.conjuncts) property ent_type: def __get__(self): From 6e0f985afc4cef65386f8c7a193dde9c510ad8f2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 15 Oct 2015 03:49:45 +1100 Subject: [PATCH 7/7] * Fix token.conjuncts --- spacy/tokens/token.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 1f672ee35..dc2abbb25 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -251,7 +251,7 @@ cdef class Token: """Get a list of conjoined words.""" cdef Token word conjuncts = [] - if self.c.pos != CONJ and self.c.pos != PUNCT: + if self.dep_ != 'conj': for word in self.rights: if word.dep_ == 'conj': yield word