* Refactor for string view features. Working on setting up flags and enums.

Matthew Honnibal 2014-07-07 16:58:48 +02:00
parent 9fd085bf90
commit 057c21969b
11 changed files with 167 additions and 79 deletions

View File

@@ -1,8 +1,8 @@
from libcpp.vector cimport vector
from spacy.spacy cimport StringHash
from spacy.lexeme cimport Lexeme
from spacy.lexeme cimport Lexeme_addr
from spacy.spacy cimport Lexeme
from spacy.spacy cimport Lexeme_addr
from spacy.spacy cimport Language
from spacy.tokens cimport Tokens

View File

@@ -9,7 +9,6 @@ from libc.stdlib cimport malloc, calloc, free
from libc.stdint cimport uint64_t
from libcpp.vector cimport vector
from spacy.lexeme cimport Lexeme
from spacy.string_tools cimport substr
from . import util

View File

@@ -2,8 +2,8 @@ from libcpp.vector cimport vector
from spacy.spacy cimport StringHash
from spacy.spacy cimport Language
from spacy.lexeme cimport Lexeme
from spacy.lexeme cimport Lexeme_addr
from spacy.spacy cimport Lexeme
from spacy.spacy cimport Lexeme_addr
from spacy.tokens cimport Tokens

View File

@@ -4,11 +4,11 @@ boldly assume no collisions.
'''
from __future__ import unicode_literals
from libc.stdlib cimport malloc, calloc, free
from libc.stdint cimport uint64_t
from libcpp.vector cimport vector
from spacy.lexeme cimport Lexeme
from spacy.string_tools cimport substr
from spacy.spacy cimport Language
from . import util

View File

@@ -4,29 +4,48 @@ from libc.stdint cimport uint64_t
ctypedef int ClusterID
ctypedef uint64_t StringHash
ctypedef size_t Lexeme_addr
ctypedef char Bits8
ctypedef uint64_t Bits64

from spacy.spacy cimport Language

cdef struct Orthography:
    StringHash last3
    StringHash shape
    StringHash norm
    Py_UNICODE first
    Bits8 flags

cdef struct Distribution:
    double prob
    ClusterID cluster
    Bits64 tagdict
    Bits8 flags

cdef struct Lexeme:
    StringHash sic      # Hash of the original string
    StringHash lex      # Hash of the word, with punctuation and clitics split off
    StringHash normed   # Hash of the normalized version of lex
    StringHash last3    # Last 3 characters of the token
    Py_UNICODE first    # First character of the token
    double prob         # What is the log probability of the lex value?
    ClusterID cluster   # Brown cluster of the token
    bint oft_upper      # Is the lowered version of the lex value often in all caps?
    bint oft_title      # Is the lowered version of the lex value often title-cased?
    Distribution* dist  # Distribution info, lazy loaded
    Orthography* orth   # Extra orthographic views
    Lexeme* tail        # Lexemes are linked lists, to deal with sub-tokens

cdef Lexeme BLANK_WORD = Lexeme(0, 0, 0, 0, 0, 0.0, 0, False, False, NULL)
cdef Lexeme BLANK_WORD = Lexeme(0, 0, NULL, NULL, NULL)

cdef Lexeme* init_lexeme(Language lang, unicode string, StringHash hashed,
                         int split, size_t length)
cpdef StringHash lex_of(size_t lex_id) except 0
cpdef StringHash norm_of(size_t lex_id) except 0
#cdef Lexeme* init_lexeme(Language lang, unicode string, StringHash hashed,
#                         int split, size_t length)

# Use these to access the Lexeme fields via get_attr(Lexeme*, LexAttr), which
# has a conditional to pick out the correct item. This allows safe iteration
# over the Lexeme, via:
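The comment above breaks off after "via:"; the accessor itself is not shown in this hunk. Purely as an editorial sketch (not part of this commit, and with assumed enum and field names), a conditional accessor of the kind the comment describes could look like:

# Editorial sketch only: enum values are illustrative, not taken from the diff.
cdef enum LexAttr:
    SIC
    LEX
    NORM
    LAST3

cdef StringHash get_attr(Lexeme* word, LexAttr attr):
    # One branch per hash-valued view, so callers can loop over the enum
    # instead of touching struct fields directly.
    if attr == SIC:
        return word.sic
    elif attr == LEX:
        return word.lex
    elif attr == NORM:
        return word.orth.norm if word.orth != NULL else 0
    else:
        return word.orth.last3 if word.orth != NULL else 0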

View File

@@ -11,49 +11,7 @@ from libc.stdlib cimport malloc, calloc, free
from libc.stdint cimport uint64_t
from libcpp.vector cimport vector
cdef Lexeme* init_lexeme(Language lang, unicode string, StringHash hashed,
                         int split, size_t length):
    assert split <= length
    cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))

    word.first = <Py_UNICODE>(string[0] if string else 0)
    word.sic = hashed

    cdef unicode tail_string
    cdef unicode lex
    if split != 0 and split < length:
        lex = substr(string, 0, split, length)
        tail_string = substr(string, split, length, length)
    else:
        lex = string
        tail_string = ''
    assert lex

    #cdef unicode normed = normalize_word_string(lex)
    cdef unicode normed = '?'
    cdef unicode last3 = substr(string, length - 3, length, length)
    assert normed
    assert len(normed)

    word.lex = lang.hash_string(lex, len(lex))
    word.normed = lang.hash_string(normed, len(normed))
    word.last3 = lang.hash_string(last3, len(last3))

    lang.bacov[word.lex] = lex
    lang.bacov[word.normed] = normed
    lang.bacov[word.last3] = last3

    # These are loaded later
    word.prob = 0
    word.cluster = 0
    word.oft_upper = False
    word.oft_title = False

    # Now recurse, and deal with the tail
    if tail_string:
        word.tail = <Lexeme*>lang.lookup(-1, tail_string, len(tail_string))
    return word
from spacy.spacy cimport StringHash
cpdef StringHash sic_of(size_t lex_id) except 0:
@@ -82,6 +40,20 @@ cpdef StringHash lex_of(size_t lex_id) except 0:
    return (<Lexeme*>lex_id).lex

cpdef StringHash norm_of(size_t lex_id) except 0:
    '''Access the `lex' field of the Lexeme pointed to by lex_id.

    The lex field is the hash of the string you would expect to get back from
    a standard tokenizer, i.e. the word with punctuation and other non-whitespace
    delimited tokens split off. The other fields refer to properties of the
    string that the lex field stores a hash of, except sic and tail.

    >>> [unhash(lex_of(lex_id)) for lex_id in from_string(u'Hi! world')]
    [u'Hi', u'!', u'world']
    '''
    return (<Lexeme*>lex_id).orth.norm

cpdef ClusterID cluster_of(size_t lex_id):
    '''Access the `cluster' field of the Lexeme pointed to by lex_id, which
    gives an integer representation of the cluster ID of the word,
@@ -98,7 +70,7 @@ cpdef ClusterID cluster_of(size_t lex_id):
    while "dapple" is totally different. On the other hand, "scalable" receives
    the same cluster ID as "pineapple", which is not what we'd like.
    '''
    return (<Lexeme*>lex_id).cluster
    return (<Lexeme*>lex_id).dist.cluster
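The docstring above compares words by reading their cluster IDs as binary prefixes. As an illustration only (the cluster codes below are invented, not taken from spaCy), a prefix comparison could be written like this:

# Editorial sketch: invented 16-bit cluster codes, purely for illustration.
def shared_prefix_len(a, b, width=16):
    a_bits = format(a, '0%db' % width)
    b_bits = format(b, '0%db' % width)
    n = 0
    while n < width and a_bits[n] == b_bits[n]:
        n += 1
    return n

shared_prefix_len(0b1001111101100000, 0b1001111001000000)  # -> 7, long shared prefix: similar words
shared_prefix_len(0b1001111101100000, 0b0101011101100100)  # -> 0, no shared prefix: unrelated words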
cpdef Py_UNICODE first_of(size_t lex_id):
@@ -109,7 +81,7 @@ cpdef Py_UNICODE first_of(size_t lex_id):
    >>> unhash(first_of(lex_id))
    u'H'
    '''
    return (<Lexeme*>lex_id).first
    return (<Lexeme*>lex_id).orth.first
cpdef double prob_of(size_t lex_id):
@@ -133,7 +105,8 @@ cpdef StringHash last3_of(size_t lex_id):
    >>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
    [u'llo', u'!']
    '''
    return (<Lexeme*>lex_id).last3
    cdef Lexeme* w = <Lexeme*>lex_id
    return w.orth.last3 if w.orth != NULL else 0
cpdef bint is_oft_upper(size_t lex_id):
@@ -148,7 +121,12 @@ cpdef bint is_oft_upper(size_t lex_id):
    >>> is_oft_upper(lookup(u'aBc')) # This must get the same answer
    True
    '''
    return (<Lexeme*>lex_id).oft_upper
    return False
    #cdef Lexeme* w = <Lexeme*>lex_id
    #return w.orth.last3 if w.orth != NULL else 0
    #return (<Lexeme*>lex_id).oft_upper
cpdef bint is_oft_title(size_t lex_id):
@@ -163,4 +141,5 @@ cpdef bint is_oft_title(size_t lex_id):
    >>> is_oft_title(lookup(u'MARCUS')) # This must get the same value
    True
    '''
    return (<Lexeme*>lex_id).oft_title
    return False
    #return (<Lexeme*>lex_id).oft_title

View File

@@ -7,16 +7,27 @@ from ext.sparsehash cimport dense_hash_map
# Circular import problems here
ctypedef size_t Lexeme_addr
ctypedef uint64_t StringHash
ctypedef dense_hash_map[StringHash, Lexeme_addr] Vocab
ctypedef int (*Splitter)(unicode word, size_t length)
ctypedef dense_hash_map[StringHash, size_t] Vocab
from spacy.lexeme cimport Lexeme
from spacy.tokens cimport Tokens
# Put these above import to avoid circular import problem
ctypedef char Bits8
ctypedef uint64_t Bits64
ctypedef int ClusterID
from spacy.lexeme cimport Lexeme
from spacy.tokens cimport Tokens
from spacy.lexeme cimport Distribution
from spacy.lexeme cimport Orthography
cdef class Language:
    cdef object name
    cdef Vocab* vocab
    cdef Vocab* distri
    cdef Vocab* ortho
    cdef dict bacov

    cdef int find_split(self, unicode word, size_t length)
@@ -26,3 +37,8 @@ cdef class Language:
    cpdef Tokens tokenize(self, unicode text)

    cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length)
    cdef Lexeme* init_lexeme(self, unicode string, StringHash hashed,
                             int split, size_t length)
    cdef Orthography* init_orth(self, StringHash hashed, unicode lex)

View File

@@ -6,22 +6,43 @@ from libc.stdlib cimport calloc, free
from ext.murmurhash cimport MurmurHash64A
from ext.murmurhash cimport MurmurHash64B
from spacy.lexeme cimport init_lexeme
from spacy.lexeme cimport Lexeme
from spacy.lexeme cimport BLANK_WORD
from spacy.string_tools cimport is_whitespace
from spacy.string_tools cimport substr
from . import util
from os import path
cimport cython
def get_normalized(unicode lex, size_t length):
    return lex.lower()
    #if lex.isdigit():
    #    return '!YEAR' if length == 4 else '!DIGIT'
    #else:
    #    return lex.lower()

def get_word_shape(lex, length):
    return lex

def set_orth_flags(lex, length):
    return 0
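At this point get_word_shape and set_orth_flags are placeholders (the commit message notes the flags and enums are still being set up). Purely as an editorial sketch, not part of this commit, a word-shape transform of the usual kind might look roughly like this:

# Editorial sketch: a typical word-shape mapping, e.g. u'Token' -> u'Xxxxx',
# u'2014' -> u'dddd'. The exact scheme is assumed here, not taken from spaCy.
def _sketch_word_shape(unicode lex):
    shape = []
    for c in lex:
        if c.isdigit():
            shape.append(u'd')
        elif c.isupper():
            shape.append(u'X')
        elif c.isalpha():
            shape.append(u'x')
        else:
            shape.append(c)
    return u''.join(shape)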
cdef class Language:
    def __cinit__(self, name):
        self.name = name
        self.bacov = {}
        self.vocab = new Vocab()
        self.ortho = new Vocab()
        self.distri = new Vocab()
        self.vocab[0].set_empty_key(0)
        self.distri[0].set_empty_key(0)
        self.ortho[0].set_empty_key(0)
        self.load_tokenization(util.read_tokenization(name))

    def load_tokenization(self, token_rules=None):
@@ -80,7 +101,7 @@ cdef class Language:
            return <Lexeme_addr>word_ptr

    cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length):
        word = init_lexeme(self, string, hashed, split, length)
        word = self.init_lexeme(string, hashed, split, length)
        self.vocab[0][hashed] = <Lexeme_addr>word
        self.bacov[hashed] = string
        return word
@@ -121,6 +142,55 @@ cdef class Language:
    cdef int find_split(self, unicode word, size_t length):
        return -1

    cdef Lexeme* init_lexeme(self, unicode string, StringHash hashed,
                             int split, size_t length):
        cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))

        word.sic = hashed

        cdef unicode tail_string
        cdef unicode lex
        if split != 0 and split < length:
            lex = substr(string, 0, split, length)
            tail_string = substr(string, split, length, length)
        else:
            lex = string
            tail_string = ''

        word.lex = self.hash_string(lex, len(lex))
        self.bacov[word.lex] = lex
        word.orth = <Orthography*>self.ortho[0][word.lex]
        if word.orth == NULL:
            word.orth = self.init_orth(word.lex, lex)
        word.dist = <Distribution*>self.distri[0][word.lex]

        # Now recurse, and deal with the tail
        if tail_string:
            word.tail = <Lexeme*>self.lookup(-1, tail_string, len(tail_string))
        return word

    cdef Orthography* init_orth(self, StringHash hashed, unicode lex):
        cdef Orthography* orth = <Orthography*>calloc(1, sizeof(Orthography))
        orth.first = <Py_UNICODE>lex[0]

        cdef int length = len(lex)
        orth.flags = set_orth_flags(lex, length)

        cdef unicode last3 = substr(lex, length - 3, length, length)
        cdef unicode norm = get_normalized(lex, length)
        cdef unicode shape = get_word_shape(lex, length)

        orth.last3 = self.hash_string(last3, len(last3))
        orth.shape = self.hash_string(shape, len(shape))
        orth.norm = self.hash_string(norm, len(norm))

        self.bacov[orth.last3] = last3
        self.bacov[orth.shape] = shape
        self.bacov[orth.norm] = norm

        self.ortho[0][hashed] = <size_t>orth
        return orth
cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
@@ -137,7 +207,7 @@ cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
cpdef vector[size_t] expand_chunk(size_t addr) except *:
    cdef vector[size_t] tokens = vector[size_t]()
    word = <Lexeme*>addr
    while word is not NULL:
    while word != NULL:
        tokens.push_back(<size_t>word)
        word = word.tail
    return tokens

View File

@@ -1,5 +1,6 @@
# cython: profile=True

cpdef unicode substr(unicode string, int start, int end, size_t length):
    if end >= length:
        end = -1

View File

@@ -1,6 +1,5 @@
from libcpp.vector cimport vector
from spacy.lexeme cimport Lexeme
from spacy.lexeme cimport Lexeme_addr
from spacy.spacy cimport Lexeme_addr
from cython.operator cimport dereference as deref
from spacy.spacy cimport Language

View File

@@ -2,6 +2,11 @@ from cython.operator cimport dereference as deref
from cython.operator cimport preincrement as inc
from spacy.lexeme cimport Lexeme
from spacy.lexeme cimport norm_of
from spacy.spacy cimport StringHash
cdef class Tokens:
    def __cinit__(self, Language lang):
        self.lang = lang
@@ -38,11 +43,11 @@ cdef class Tokens:
    cpdef dict count_by(self, Field attr):
        counts = {}
        cdef Lexeme_addr t
        cdef Lexeme* word
        cdef StringHash key
        for t in self.vctr[0]:
            word = <Lexeme*>t
            if word.lex not in counts:
                counts[word.lex] = 0
            counts[word.lex] += 1
            key = norm_of(t)
            if key not in counts:
                counts[key] = 0
            counts[key] += 1
        return counts
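For orientation only: after this change, count_by keys its dictionary on the normalized-form hash rather than on word.lex. A hypothetical usage sketch, not part of this diff, with LEX standing in for an assumed Field value:

# Editorial sketch: `lang` is an initialised Language; LEX is an assumed Field
# value (the body above currently ignores the attr argument).
tokens = lang.tokenize(u'The the THE')
counts = tokens.count_by(LEX)
# get_normalized() lower-cases, so all three tokens share one norm hash and
# counts maps that single key to 3.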