Merge remote-tracking branch 'refs/remotes/honnibal/master'

2025-11-01 00:17:44 +03:00 · 2015-10-15 12:10:23 +03:00 · 2015-10-15 12:10:23 +03:00 · 23475360e4
commit 23475360e4
parent d6be51d08f 6e0f985afc
3 changed files with 49 additions and 20 deletions
--- a/examples/_handler.py
+++ b/examples/_handler.py
@@ -0,0 +1,37 @@
 # encoding: utf8
 from __future__ import unicode_literals, print_function
 from math import sqrt
 from numpy import dot
 from numpy.linalg import norm
 def handle_tweet(spacy, tweet_data, query):
    """Pull the text out of one tweet and hand it to match_tweet.

    spacy: pipeline object, forwarded unchanged to match_tweet.
    tweet_data: mapping describing one tweet; only its 'text' entry is
        read, and a missing entry is treated as an empty string.
    query: search term, forwarded unchanged to match_tweet.
    """
    text = tweet_data.get('text', u'')
    # Twython returns either bytes or unicode, depending on tweet.
    # Decode bytes up front instead of retrying after a TypeError: the
    # retry also fired for unrelated TypeErrors raised inside match_tweet
    # and then failed a second time calling .decode() on unicode text.
    if isinstance(text, bytes):
        text = text.decode('utf8')
    match_tweet(spacy, text, query)
 def match_tweet(spacy, text, query):
    """Print `text` if its words are closer to the accept list than the reject list.

    spacy: callable that turns `text` into tokens exposing `repvec`,
        `is_alpha` and `lower_`; `spacy.vocab[word].repvec` supplies the
        vectors for the reference words.
    text: the tweet text to score.
    query: tokens whose lowercase form equals this value are left out of
        the score.

    Every remaining alphabetic token is compared with each accept word and
    each reject word by cosine similarity; negative similarities count as
    zero. The text is printed when the accept share of the summed
    similarity is at least 0.5. Nothing is returned.
    """
    def get_vector(word):
        return spacy.vocab[word].repvec

    tweet = [w.repvec for w in spacy(text)
             if w.is_alpha and w.lower_ != query]
    if not tweet:
        return
    # Real lists rather than map(): both are iterated once per tweet word,
    # and a lazy map object would be exhausted after the first word.
    accept = [get_vector(word) for word in 'child classroom teach'.split()]
    reject = [get_vector(word) for word in 'mouth hands giveaway'.split()]
    y = sum(max(cos(w1, w2), 0) for w1 in tweet for w2 in accept)
    n = sum(max(cos(w1, w2), 0) for w1 in tweet for w2 in reject)
    total = y + n
    # A total of zero means no positive similarity to either list: there is
    # nothing to score and the ratio would divide by zero. The previous
    # `or True` on this test made every tweet print regardless of score.
    if total > 0 and (y / total) >= 0.5:
        print(text)
 def cos(v1, v2):
    """Return the cosine similarity of the vectors v1 and v2.

    The result is dot(v1, v2) divided by the product of the two norms.
    When either vector has zero length that ratio is undefined, so 0.0 is
    returned instead of NaN.
    """
    denominator = norm(v1) * norm(v2)
    if denominator == 0:
        return 0.0
    return dot(v1, v2) / denominator
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -248,24 +248,16 @@ cdef class Token:
    property conjuncts:
        def __get__(self):
-            """Get a list of conjoined words"""
+            """Get a list of conjoined words."""
            cdef Token word
-            conjs = []
+            conjuncts = []
-            if self.c.pos != CONJ and self.c.pos != PUNCT:
+            if self.dep_ != 'conj':
-                seen_conj = False
+                for word in self.rights:
-                for word in reversed(list(self.lefts)):
+                    if word.dep_ == 'conj':
-                    if word.c.pos == CONJ:
+                        yield word
-                        seen_conj = True
+                        yield from word.conjuncts
-                    elif seen_conj and word.c.pos == self.c.pos:
+                        conjuncts.append(word)
-                        conjs.append(word)
+                        conjuncts.extend(word.conjuncts)
            conjs.reverse()
            conjs.append(self)
            if seen_conj:
                return conjs
            elif self is not self.head and self in self.head.conjuncts:
                return self.head.conjuncts
            else:
                return []
    property ent_type:
        def __get__(self):
--- a/tests/parser/test_space_attachment.py
+++ b/tests/parser/test_space_attachment.py
@@ -7,6 +7,6 @@ def test_space_attachment(EN):
    sentence = 'This is a test.\nTo ensure  spaces are attached well.'
    doc = EN(sentence)
-    for word in doc:
+    for sent in doc.sents:
-        if word.is_space:
+        if len(sent) == 1:
-            assert word.head.i == (word.i - 1)
+            assert not sent[-1].is_space