From 5fddb8d16590b549161e84b256a20e694e090291 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <matthew@honnibal.com>
Date: Tue, 19 Aug 2014 04:21:20 +0200
Subject: [PATCH] * Working refactor, with updated data model for Lexemes

---
 spacy/en.pyx     | 28 ---------------------
 spacy/lexeme.pxd | 20 +++++++--------
 spacy/lexeme.pyx | 45 ++++++++++++++++++++++++++--------
 spacy/spacy.pxd  |  3 ---
 spacy/spacy.pyx  | 64 ++++++++++++++++++++----------------------------
 5 files changed, 72 insertions(+), 88 deletions(-)

diff --git a/spacy/en.pyx b/spacy/en.pyx
index 4357addfe..3b7d506fa 100644
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@@ -66,31 +66,3 @@ cpdef unicode unhash(StringHash hash_value):
     return EN.unhash(hash_value)
 
 
-cpdef bint is_oft_upper(size_t lex_id):
-    '''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which
-    stores whether the lowered version of the string hashed by `lex' is found
-    in all-upper case frequently in a large sample of text.  Users are free
-    to load different data, by default we use a sample from Wikipedia, with
-    a threshold of 0.95, picked to maximize mutual information for POS tagging.
-
-    >>> is_oft_upper(lookup(u'abc'))
-    True
-    >>> is_oft_upper(lookup(u'aBc')) # This must get the same answer
-    True
-    '''
-    return (<Lexeme*>lex_id).dist.flags & OFT_UPPER
-
-
-cpdef bint is_oft_title(size_t lex_id):
-    '''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which
-    stores whether the lowered version of the string hashed by `lex' is found
-    title-cased frequently in a large sample of text.  Users are free
-    to load different data, by default we use a sample from Wikipedia, with
-    a threshold of 0.3, picked to maximize mutual information for POS tagging.
-
-    >>> is_oft_title(lookup(u'marcus'))
-    True
-    >>> is_oft_title(lookup(u'MARCUS')) # This must get the same value
-    True
-    '''
-    return (<Lexeme*>lex_id).dist.flags & OFT_TITLE
diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd
index 6175ec3a8..6d944eb25 100644
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -32,12 +32,9 @@ cdef enum DistFlag:
 
 
 cdef struct Orthography:
-    StringHash last3
     StringHash shape
     StringHash norm
-
-    size_t length
-    Py_UNICODE first
+    StringHash last3  # Hash of the word's last three characters
     Bits8 flags
 
 
@@ -49,12 +46,17 @@ cdef struct Distribution:
 
 
 cdef struct Lexeme:
-    StringHash lex # Hash of the word
-    Orthography* orth  # Extra orthographic views
-    Distribution* dist # Distribution info
+    char* string  # UTF-8 encoded bytes of the word
+    size_t length  # Length of the UTF-8 encoding, in bytes
+    StringHash lex  # Hash of the word
+    Orthography orth  # Extra orthographic views
+    Distribution dist # Distribution info
 
 
-cdef Lexeme BLANK_WORD = Lexeme(0, NULL, NULL)
+cdef Lexeme BLANK_WORD = Lexeme(NULL, 0, 0,  # string, length, lex
+    Orthography(0, 0, 0, 0),  # shape, norm, last3, flags
+    Distribution(0.0, 0, 0, 0)
+)
 
 
 cdef enum StringAttr:
@@ -68,13 +70,11 @@ cdef enum StringAttr:
 cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0
 
 cpdef StringHash lex_of(size_t lex_id) except 0
-
 cpdef StringHash norm_of(size_t lex_id) except 0
 cpdef StringHash shape_of(size_t lex_id) except 0
 cpdef StringHash last3_of(size_t lex_id) except 0
 
 cpdef size_t length_of(size_t lex_id) except *
-cpdef Py_UNICODE first_of(size_t lex_id) except *
 
 cpdef double prob_of(size_t lex_id) except 0
 cpdef ClusterID cluster_of(size_t lex_id) except 0
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 27ca13bd7..37392637b 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -72,7 +72,7 @@ cpdef StringHash last3_of(size_t lex_id) except 0:
     return (<Lexeme*>lex_id).orth.last3
 
 
-cpdef ClusterID cluster_of(size_t lex_id):
+cpdef ClusterID cluster_of(size_t lex_id) except 0:
     '''Access the `cluster' field of the Lexeme pointed to by lex_id, which
     gives an integer representation of the cluster ID of the word, 
     which should be understood as a binary address:
@@ -99,21 +99,17 @@ cpdef Py_UNICODE first_of(size_t lex_id):
     >>> unhash(first_of(lex_id))
     u'H'
     '''
-    if (<Lexeme*>lex_id).orth == NULL:
-        return 0
     return (<Lexeme*>lex_id).orth.first
 
 
-cpdef StringHash length_of(size_t lex_id):
+cpdef size_t length_of(size_t lex_id) except *:
     '''Access the `length' field of the Lexeme pointed to by lex_id, which stores
     the length of the string hashed by lex_of.'''
     cdef Lexeme* word = <Lexeme*>lex_id
-    if (<Lexeme*>lex_id).orth == NULL:
-        return 0
-    return (<Lexeme*>lex_id).orth.length
+    return word.length  # Language.new_lexeme stores the UTF-8 byte count here, not the character count
 
 
-cpdef double prob_of(size_t lex_id):
+cpdef double prob_of(size_t lex_id) except 0:
     '''Access the `prob' field of the Lexeme pointed to by lex_id, which stores
     the smoothed unigram log probability of the word, as estimated from a large
     text corpus.  By default, probabilities are based on counts from Gigaword,
@@ -126,9 +122,38 @@ cpdef double prob_of(size_t lex_id):
     return (<Lexeme*>lex_id).dist.prob
 
 
-cpdef bint check_orth_flag(size_t lex, OrthFlag flag) except *:
+cpdef bint is_oft_upper(size_t lex_id):
+    '''Check the OFT_UPPER flag of the Lexeme pointed to by lex_id, which
+    stores whether the lowered version of the string hashed by `lex' is found
+    in all-upper case frequently in a large sample of text.  Users are free
+    to load different data; by default we use a sample from Wikipedia, with
+    a threshold of 0.95, picked to maximize mutual information for POS tagging.
+
+    >>> is_oft_upper(lookup(u'abc'))
+    True
+    >>> is_oft_upper(lookup(u'aBc')) # This must get the same answer
+    True
+    '''
+    return (<Lexeme*>lex_id).dist.flags & OFT_UPPER  # NOTE(review): check_dist_flag masks with (1 << flag); confirm OFT_UPPER is already a bit mask
+
+
+cpdef bint is_oft_title(size_t lex_id):
+    '''Check the OFT_TITLE flag of the Lexeme pointed to by lex_id, which
+    stores whether the lowered version of the string hashed by `lex' is found
+    title-cased frequently in a large sample of text.  Users are free
+    to load different data; by default we use a sample from Wikipedia, with
+    a threshold of 0.3, picked to maximize mutual information for POS tagging.
+
+    >>> is_oft_title(lookup(u'marcus'))
+    True
+    >>> is_oft_title(lookup(u'MARCUS')) # This must get the same value
+    True
+    '''
+    return (<Lexeme*>lex_id).dist.flags & OFT_TITLE  # NOTE(review): check_dist_flag masks with (1 << flag); confirm OFT_TITLE is already a bit mask
+
+cpdef bint check_orth_flag(size_t lex_id, OrthFlag flag) except *:  # True when bit number `flag` is set in orth.flags
     return (<Lexeme*>lex_id).orth.flags & (1 << flag)
 
 
-cpdef bint check_dist_flag(size_t lex, DistFlag flag) except *:
+cpdef bint check_dist_flag(size_t lex_id, DistFlag flag) except *:  # True when bit number `flag` is set in dist.flags
     return (<Lexeme*>lex_id).dist.flags & (1 << flag)
diff --git a/spacy/spacy.pxd b/spacy/spacy.pxd
index 344b3577c..b9caac34f 100644
--- a/spacy/spacy.pxd
+++ b/spacy/spacy.pxd
@@ -21,7 +21,6 @@ ctypedef int ClusterID
 from spacy.lexeme cimport Lexeme
 from spacy.lexeme cimport Distribution
 from spacy.lexeme cimport Orthography
-from spacy._hashing cimport WordTree
 
 
 cdef class Language:
@@ -37,8 +36,6 @@ cdef class Language:
     
     cdef Lexeme** new_chunk(self, unicode string, list substrings) except NULL
     cdef Lexeme* new_lexeme(self, unicode lex) except NULL
-    cdef Orthography* new_orth(self, unicode lex) except NULL
-    cdef Distribution* new_dist(self, unicode lex) except NULL
     
     cdef unicode unhash(self, StringHash hashed)
     
diff --git a/spacy/spacy.pyx b/spacy/spacy.pyx
index e4c5cf240..a8b4ebe74 100644
--- a/spacy/spacy.pyx
+++ b/spacy/spacy.pyx
@@ -13,15 +13,19 @@ from spacy.string_tools cimport substr
 from . import util
 from os import path
 
+DIST_FLAGS = {}  # maps each entry of word_dist.flags to the bits load_dist_info ORs into dist.flags
+TAGS = {}  # maps each entry of word_dist.tagdict to the bits load_dist_info ORs into dist.tagdict
 
-def get_normalized(unicode lex, size_t length):
+
+def get_normalized(unicode lex):
     if lex.isalpha() and lex.islower():
         return lex
     else:
-        return get_word_shape(lex, length)
+        return get_word_shape(lex)  # anything that is not an all-lowercase alphabetic word normalizes to its shape
 
 
-def get_word_shape(unicode lex, length):
+def get_word_shape(unicode lex):
+    cdef size_t length = len(lex)
     shape = ""
     last = ""
     shape_char = ""
@@ -47,7 +51,7 @@ def get_word_shape(unicode lex, length):
     return shape
 
 
-def set_orth_flags(lex, length):
+def set_orth_flags(lex):  # placeholder: always returns 0, so no orth flag bits get set
     return 0
 
 
@@ -60,7 +64,7 @@ cdef class Language:
         self.chunks.set_empty_key(0)
         self.vocab.set_empty_key(0)
         self.load_tokenization(util.read_tokenization(name))
-        self.load_dist_info(util.read_dist_info(name))
+        #self.load_dist_info(util.read_dist_info(name))
 
     cdef Tokens tokenize(self, unicode string):
         cdef Lexeme** chunk
@@ -106,39 +110,25 @@ cdef class Language:
 
     cdef Lexeme* new_lexeme(self, unicode string) except NULL:
         cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
+        cdef bytes byte_string = string.encode('utf8')
+        word.string = <char*>byte_string  # NOTE(review): points into byte_string's own buffer (no copy); nothing keeps byte_string alive after return, so this probably dangles -- confirm
+        word.length = len(byte_string)  # UTF-8 byte count, not the number of unicode characters
+        word.orth.flags = set_orth_flags(string)
+        cdef unicode norm = get_normalized(string)
+        cdef unicode shape = get_word_shape(string)
+        cdef unicode last3 = string[-3:]  # the whole string when it has fewer than three characters
         word.lex = hash(string)
+        word.orth.norm = hash(norm)
+        word.orth.shape = hash(shape)
+        word.orth.last3 = hash(last3)
         self.bacov[word.lex] = string
-        word.orth = self.new_orth(string)
+        self.bacov[word.orth.norm] = norm  # reverse-index entries so unhash() can recover the derived strings
+        self.bacov[word.orth.shape] = shape
+        self.bacov[word.orth.last3] = last3
 
-        word.dist = <Distribution*>calloc(1, sizeof(Distribution))
-        self.vocab[word.lex] = <size_t>word
+        self.vocab[hash(string)] = <size_t>word  # same key as word.lex; the struct's address is stored as an integer
         return word
 
-    cdef Orthography* new_orth(self, unicode lex) except NULL:
-        cdef unicode last3
-        cdef unicode norm
-        cdef unicode shape
-        cdef int length 
-
-        length = len(lex)
-        orth = <Orthography*>calloc(1, sizeof(Orthography))
-        orth.first = lex[0]
-            
-        orth.length = length
-        orth.flags = set_orth_flags(lex, orth.length)
-        orth.norm = hash(lex)
-        last3 = substr(lex, length - 3, length, length)
-        orth.last3 = hash(last3)
-        norm = get_normalized(lex, length)
-        orth.norm = hash(norm)
-        shape = get_word_shape(lex, length)
-        orth.shape = hash(shape)
-
-        self.bacov[orth.last3] = last3
-        self.bacov[orth.norm] = norm
-        self.bacov[orth.shape] = shape
-        return orth
-
     cdef unicode unhash(self, StringHash hash_value):
         '''Fetch a string from the reverse index, given its hash value.'''
         return self.bacov[hash_value]
@@ -167,12 +157,12 @@ cdef class Language:
         cdef Lexeme* w
         for string, word_dist in dist_info.items():
             w = self.lookup(string)
-            w.prob = word_dist.prob
-            w.cluster = word_dist.cluster
+            w.dist.prob = word_dist.prob
+            w.dist.cluster = word_dist.cluster
             for flag in word_dist.flags:
-                w.flags |= lexeme.DIST_FLAGS[flag]
+                w.dist.flags |= DIST_FLAGS[flag]
             for tag in word_dist.tagdict:
-                w.tagdict |= lexeme.TAGS[tag]
+                w.dist.tagdict |= TAGS[tag]
 
 
 cdef inline bint _is_whitespace(Py_UNICODE c) nogil: