From 80b36f9f2755715d8e9bf7abffd7d244124bb1b0 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <matthew@honnibal.com>
Date: Mon, 7 Jul 2014 19:12:19 +0200
Subject: [PATCH] * 710k words per second for counts

---
 spacy/lexeme.pxd |  1 +
 spacy/lexeme.pyx |  9 ++++++---
 spacy/spacy.pyx  | 25 ++++++++++++++++++++++++-
 spacy/tokens.pyx |  5 ++---
 4 files changed, 33 insertions(+), 7 deletions(-)

diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd
index 783329981..041bdcc47 100644
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -41,6 +41,7 @@ cdef Lexeme BLANK_WORD = Lexeme(0, 0, NULL, NULL, NULL)
 
 cpdef StringHash lex_of(size_t lex_id) except 0
 cpdef StringHash norm_of(size_t lex_id) except 0
+cpdef StringHash shape_of(size_t lex_id) except 0
 #cdef Lexeme* init_lexeme(Language lang, unicode string, StringHash hashed,
 #                         int split, size_t length)
                          
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 5d83cfeee..551b88442 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -54,6 +54,10 @@ cpdef StringHash norm_of(size_t lex_id) except 0:
     return (<Lexeme*>lex_id).orth.norm
 
 
+cpdef StringHash shape_of(size_t lex_id) except 0:  # lex_id is an integer that gets cast to a Lexeme* below
+    return (<Lexeme*>lex_id).orth.shape  # reads orth.shape through the cast pointer; no NULL check on it or on .orth
+
+
 cpdef ClusterID cluster_of(size_t lex_id):
     '''Access the `cluster' field of the Lexeme pointed to by lex_id, which
     gives an integer representation of the cluster ID of the word, 
@@ -94,7 +98,7 @@ cpdef double prob_of(size_t lex_id):
     >>> prob_of(lookup(u'world'))
     -20.10340371976182
     '''
-    pass
+    return (<Lexeme*>lex_id).dist.prob
 
 
 cpdef StringHash last3_of(size_t lex_id):
@@ -105,8 +109,7 @@ cpdef StringHash last3_of(size_t lex_id):
     >>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
     [u'llo', u'!']
     '''
-    cdef Lexeme* w = <Lexeme*>lex_id
-    return w.orth.last3 if w.orth != NULL else 0
+    return (<Lexeme*>lex_id).orth.last3
 
 
 cpdef bint is_oft_upper(size_t lex_id):
diff --git a/spacy/spacy.pyx b/spacy/spacy.pyx
index 4c4480ca1..b942e6f0f 100644
--- a/spacy/spacy.pyx
+++ b/spacy/spacy.pyx
@@ -25,7 +25,30 @@ def get_normalized(unicode lex, size_t length):
 
 
 def get_word_shape(lex, length):
-    return lex
+    shape = ""  # shape string being built, one output char per accepted input char
+    last = ""  # shape char of the previous input char; "" before the first one
+    shape_char = ""
+    seq = 0  # number of consecutive repeats of `last` seen after its first occurrence
+    for c in lex:  # NOTE: the `length` parameter is never read in this body
+        if c.isalpha():
+            if c.isupper():
+                shape_char = "X"  # uppercase letter
+            else:
+                shape_char = "x"  # any other alphabetic character
+        elif c.isdigit():
+            shape_char = "d"  # digit
+        else:
+            shape_char = c  # everything else is kept verbatim
+        if shape_char == last:
+            seq += 1
+        else:
+            seq = 0  # a new run starts here
+            last = shape_char
+        if seq < 3:  # keep at most three chars of any run; longer runs are truncated
+            shape += shape_char
+    assert shape  # an empty `lex` leaves shape == "" and fails this assertion
+    return shape
+
 
 
 def set_orth_flags(lex, length):
diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx
index 61229e500..9d40ceb26 100644
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@@ -3,7 +3,7 @@ from cython.operator cimport preincrement as inc
 
 
 from spacy.lexeme cimport Lexeme
-from spacy.lexeme cimport norm_of
+from spacy.lexeme cimport norm_of, shape_of
 from spacy.spacy cimport StringHash
 
 
@@ -45,9 +45,8 @@ cdef class Tokens:
         cdef Lexeme_addr t
         cdef StringHash key
         for t in self.vctr[0]:
-            key = norm_of(t)
+            key = (<Lexeme*>t).lex
             if key not in counts:
                 counts[key] = 0
             counts[key] += 1
         return counts
-