Refactor for string view features. Working on setting up flags and enums.

Matthew Honnibal 2014-07-07 16:58:48 +02:00
parent 9fd085bf90
commit 057c21969b
11 changed files with 167 additions and 79 deletions

View File

@@ -1,8 +1,8 @@
 from libcpp.vector cimport vector
 from spacy.spacy cimport StringHash
-from spacy.lexeme cimport Lexeme
-from spacy.lexeme cimport Lexeme_addr
+from spacy.spacy cimport Lexeme
+from spacy.spacy cimport Lexeme_addr
 from spacy.spacy cimport Language
 from spacy.tokens cimport Tokens
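Note: the same import move recurs in most files below. Lexeme and Lexeme_addr are now cimported through spacy.spacy, which re-exports the lexeme types alongside Language and the shared ctypedefs, so downstream modules have a single import root for the names involved in the circular-import problem flagged in spacy.pxd. A minimal sketch of the resulting import style (the diffs use one name per line; this combined form is equivalent Cython):

    from spacy.spacy cimport Language
    from spacy.spacy cimport Lexeme, Lexeme_addr, StringHash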

View File

@@ -9,7 +9,6 @@ from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t
 from libcpp.vector cimport vector
-from spacy.lexeme cimport Lexeme
 from spacy.string_tools cimport substr
 from . import util

View File

@@ -2,8 +2,8 @@ from libcpp.vector cimport vector
 from spacy.spacy cimport StringHash
 from spacy.spacy cimport Language
-from spacy.lexeme cimport Lexeme
-from spacy.lexeme cimport Lexeme_addr
+from spacy.spacy cimport Lexeme
+from spacy.spacy cimport Lexeme_addr
 from spacy.tokens cimport Tokens

View File

@@ -4,11 +4,11 @@ boldly assume no collisions.
 '''
 from __future__ import unicode_literals
 from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t
 from libcpp.vector cimport vector
-from spacy.lexeme cimport Lexeme
 from spacy.string_tools cimport substr
 from spacy.spacy cimport Language
 from . import util

View File

@@ -4,29 +4,48 @@ from libc.stdint cimport uint64_t
 ctypedef int ClusterID
 ctypedef uint64_t StringHash
 ctypedef size_t Lexeme_addr
+ctypedef char Bits8
+ctypedef uint64_t Bits64
 
 from spacy.spacy cimport Language
 
+cdef struct Orthography:
+    StringHash last3
+    StringHash shape
+    StringHash norm
+
+    Py_UNICODE first
+    Bits8 flags
+
+
+cdef struct Distribution:
+    double prob
+    ClusterID cluster
+    Bits64 tagdict
+    Bits8 flags
+
+
 cdef struct Lexeme:
     StringHash sic # Hash of the original string
     StringHash lex # Hash of the word, with punctuation and clitics split off
-    StringHash normed # Hash of the normalized version of lex
-    StringHash last3 # Last 3 characters of the token
-    Py_UNICODE first # First character of the token
 
-    double prob # What is the log probability of the lex value?
-    ClusterID cluster # Brown cluster of the token
-    bint oft_upper # Is the lowered version of the lex value often in all caps?
-    bint oft_title # Is the lowered version of the lex value often title-cased?
+    Distribution* dist # Distribution info, lazy loaded
+    Orthography* orth # Extra orthographic views
     Lexeme* tail # Lexemes are linked lists, to deal with sub-tokens
 
-cdef Lexeme BLANK_WORD = Lexeme(0, 0, 0, 0, 0, 0.0, 0, False, False, NULL)
+cdef Lexeme BLANK_WORD = Lexeme(0, 0, NULL, NULL, NULL)
 
-cdef Lexeme* init_lexeme(Language lang, unicode string, StringHash hashed,
-                         int split, size_t length)
+cpdef StringHash lex_of(size_t lex_id) except 0
+cpdef StringHash norm_of(size_t lex_id) except 0
+
+#cdef Lexeme* init_lexeme(Language lang, unicode string, StringHash hashed,
+#                         int split, size_t length)
 
 # Use these to access the Lexeme fields via get_attr(Lexeme*, LexAttr), which
 # has a conditional to pick out the correct item. This allows safe iteration
 # over the Lexeme, via:
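This struct split is the heart of the refactor: the hashes of the raw string stay inline in Lexeme, while the orthographic views and distributional stats move behind pointers that can be populated lazily and shared. Because orth and dist may be NULL until loaded, accessors need a guard; a minimal sketch of the pattern, the same one last3_of adopts in the .pyx diff below:

    cdef StringHash norm_or_zero(Lexeme* w):
        # The Orthography view is lazily loaded, so it may still be NULL.
        return w.orth.norm if w.orth != NULL else 0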

View File

@@ -11,49 +11,7 @@ from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t
 from libcpp.vector cimport vector
+from spacy.spacy cimport StringHash
 
-cdef Lexeme* init_lexeme(Language lang, unicode string, StringHash hashed,
-                         int split, size_t length):
-    assert split <= length
-    cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
-
-    word.first = <Py_UNICODE>(string[0] if string else 0)
-    word.sic = hashed
-
-    cdef unicode tail_string
-    cdef unicode lex
-    if split != 0 and split < length:
-        lex = substr(string, 0, split, length)
-        tail_string = substr(string, split, length, length)
-    else:
-        lex = string
-        tail_string = ''
-    assert lex
-    #cdef unicode normed = normalize_word_string(lex)
-    cdef unicode normed = '?'
-    cdef unicode last3 = substr(string, length - 3, length, length)
-    assert normed
-    assert len(normed)
-
-    word.lex = lang.hash_string(lex, len(lex))
-    word.normed = lang.hash_string(normed, len(normed))
-    word.last3 = lang.hash_string(last3, len(last3))
-
-    lang.bacov[word.lex] = lex
-    lang.bacov[word.normed] = normed
-    lang.bacov[word.last3] = last3
-
-    # These are loaded later
-    word.prob = 0
-    word.cluster = 0
-    word.oft_upper = False
-    word.oft_title = False
-
-    # Now recurse, and deal with the tail
-    if tail_string:
-        word.tail = <Lexeme*>lang.lookup(-1, tail_string, len(tail_string))
-
-    return word
 
 cpdef StringHash sic_of(size_t lex_id) except 0:
@@ -82,6 +40,20 @@ cpdef StringHash lex_of(size_t lex_id) except 0:
     return (<Lexeme*>lex_id).lex
 
+
+cpdef StringHash norm_of(size_t lex_id) except 0:
+    '''Access the `norm' field of the Lexeme pointed to by lex_id.
+
+    The norm field is the hash of a normalized view of the lex string, here
+    its lower-cased form. The other fields refer to properties of the string
+    that the lex field stores a hash of, except sic and tail.
+
+    >>> [unhash(norm_of(lex_id)) for lex_id in from_string(u'Hi! world')]
+    [u'hi', u'!', u'world']
+    '''
+    return (<Lexeme*>lex_id).orth.norm
+
 
 cpdef ClusterID cluster_of(size_t lex_id):
     '''Access the `cluster' field of the Lexeme pointed to by lex_id, which
     gives an integer representation of the cluster ID of the word,
@@ -98,7 +70,7 @@ cpdef ClusterID cluster_of(size_t lex_id):
     while "dapple" is totally different. On the other hand, "scalable" receives
     the same cluster ID as "pineapple", which is not what we'd like.
     '''
-    return (<Lexeme*>lex_id).cluster
+    return (<Lexeme*>lex_id).dist.cluster
 
 
 cpdef Py_UNICODE first_of(size_t lex_id):
@@ -109,7 +81,7 @@ cpdef Py_UNICODE first_of(size_t lex_id):
     >>> unhash(first_of(lex_id))
     u'H'
     '''
-    return (<Lexeme*>lex_id).first
+    return (<Lexeme*>lex_id).orth.first
 
 
 cpdef double prob_of(size_t lex_id):
@@ -133,7 +105,8 @@ cpdef StringHash last3_of(size_t lex_id):
     >>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
     [u'llo', u'!']
     '''
-    return (<Lexeme*>lex_id).last3
+    cdef Lexeme* w = <Lexeme*>lex_id
+    return w.orth.last3 if w.orth != NULL else 0
 
 
 cpdef bint is_oft_upper(size_t lex_id):
@@ -148,7 +121,12 @@ cpdef bint is_oft_upper(size_t lex_id):
     >>> is_oft_upper(lookup(u'aBc')) # This must get the same answer
     True
     '''
-    return (<Lexeme*>lex_id).oft_upper
+    return False
+    #cdef Lexeme* w = <Lexeme*>lex_id
+    #return w.orth.last3 if w.orth != NULL else 0
+    #return (<Lexeme*>lex_id).oft_upper
 
 
 cpdef bint is_oft_title(size_t lex_id):
@@ -163,4 +141,5 @@ cpdef bint is_oft_title(size_t lex_id):
     >>> is_oft_title(lookup(u'MARCUS')) # This must get the same value
     True
     '''
-    return (<Lexeme*>lex_id).oft_title
+    return False
+    #return (<Lexeme*>lex_id).oft_title
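Since a lex_id is just a Lexeme* address carried as a size_t, the accessors above stay free functions rather than methods. A doctest-style sketch of how they compose, assuming the from_string and unhash helpers that this module's docstrings already reference:

    >>> lex_ids = from_string(u'Hello! world')
    >>> [unhash(norm_of(i)) for i in lex_ids]
    [u'hello', u'!', u'world']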

View File

@@ -7,16 +7,27 @@ from ext.sparsehash cimport dense_hash_map
 # Circular import problems here
 ctypedef size_t Lexeme_addr
 ctypedef uint64_t StringHash
-ctypedef dense_hash_map[StringHash, Lexeme_addr] Vocab
-ctypedef int (*Splitter)(unicode word, size_t length)
+ctypedef dense_hash_map[StringHash, size_t] Vocab
+
+from spacy.lexeme cimport Lexeme
+from spacy.tokens cimport Tokens
+
+# Put these above import to avoid circular import problem
+ctypedef char Bits8
+ctypedef uint64_t Bits64
+ctypedef int ClusterID
 
 from spacy.lexeme cimport Lexeme
-from spacy.tokens cimport Tokens
+from spacy.lexeme cimport Distribution
+from spacy.lexeme cimport Orthography
 
 
 cdef class Language:
     cdef object name
     cdef Vocab* vocab
+    cdef Vocab* distri
+    cdef Vocab* ortho
     cdef dict bacov
 
     cdef int find_split(self, unicode word, size_t length)
@@ -26,3 +37,8 @@ cdef class Language:
     cpdef Tokens tokenize(self, unicode text)
 
     cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length)
+
+    cdef Lexeme* init_lexeme(self, unicode string, StringHash hashed,
+                             int split, size_t length)
+    cdef Orthography* init_orth(self, StringHash hashed, unicode lex)
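Language now owns three dense_hash_map tables: vocab maps the hash of the raw string to a Lexeme*, while ortho and distri map the hash of the split-off lex form to Orthography* and Distribution* records, so every token with the same lex form shares one set of string views. A sketch of the lookup-or-init pattern this enables, mirroring init_lexeme in the spacy.pyx diff below:

    cdef Orthography* get_orth(Language lang, StringHash lex_hash, unicode lex):
        # dense_hash_map returns 0 for a missing key, which casts to NULL.
        cdef Orthography* orth = <Orthography*>lang.ortho[0][lex_hash]
        if orth == NULL:
            orth = lang.init_orth(lex_hash, lex)
        return orth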

View File

@@ -6,22 +6,43 @@ from libc.stdlib cimport calloc, free
 from ext.murmurhash cimport MurmurHash64A
 from ext.murmurhash cimport MurmurHash64B
-from spacy.lexeme cimport init_lexeme
+from spacy.lexeme cimport Lexeme
 from spacy.lexeme cimport BLANK_WORD
-from spacy.string_tools cimport is_whitespace
+from spacy.string_tools cimport substr
 
 from . import util
 from os import path
 
 cimport cython
 
+
+def get_normalized(unicode lex, size_t length):
+    return lex.lower()
+    #if lex.isdigit():
+    #    return '!YEAR' if length == 4 else '!DIGIT'
+    #else:
+    #    return lex.lower()
+
+
+def get_word_shape(lex, length):
+    return lex
+
+
+def set_orth_flags(lex, length):
+    return 0
+
+
 cdef class Language:
     def __cinit__(self, name):
         self.name = name
         self.bacov = {}
         self.vocab = new Vocab()
+        self.ortho = new Vocab()
+        self.distri = new Vocab()
         self.vocab[0].set_empty_key(0)
+        self.distri[0].set_empty_key(0)
+        self.ortho[0].set_empty_key(0)
         self.load_tokenization(util.read_tokenization(name))
 
     def load_tokenization(self, token_rules=None):
@@ -80,7 +101,7 @@ cdef class Language:
         return <Lexeme_addr>word_ptr
 
     cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length):
-        word = init_lexeme(self, string, hashed, split, length)
+        word = self.init_lexeme(string, hashed, split, length)
         self.vocab[0][hashed] = <Lexeme_addr>word
         self.bacov[hashed] = string
         return word
@@ -121,6 +142,55 @@ cdef class Language:
     cdef int find_split(self, unicode word, size_t length):
         return -1
 
+    cdef Lexeme* init_lexeme(self, unicode string, StringHash hashed,
+                             int split, size_t length):
+        cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
+
+        word.sic = hashed
+
+        cdef unicode tail_string
+        cdef unicode lex
+        if split != 0 and split < length:
+            lex = substr(string, 0, split, length)
+            tail_string = substr(string, split, length, length)
+        else:
+            lex = string
+            tail_string = ''
+
+        word.lex = self.hash_string(lex, len(lex))
+        self.bacov[word.lex] = lex
+        word.orth = <Orthography*>self.ortho[0][word.lex]
+        if word.orth == NULL:
+            word.orth = self.init_orth(word.lex, lex)
+        word.dist = <Distribution*>self.distri[0][word.lex]
+
+        # Now recurse, and deal with the tail
+        if tail_string:
+            word.tail = <Lexeme*>self.lookup(-1, tail_string, len(tail_string))
+        return word
+
+    cdef Orthography* init_orth(self, StringHash hashed, unicode lex):
+        cdef Orthography* orth = <Orthography*>calloc(1, sizeof(Orthography))
+        orth.first = <Py_UNICODE>lex[0]
+
+        cdef int length = len(lex)
+        orth.flags = set_orth_flags(lex, length)
+
+        cdef unicode last3 = substr(lex, length - 3, length, length)
+        cdef unicode norm = get_normalized(lex, length)
+        cdef unicode shape = get_word_shape(lex, length)
+
+        orth.last3 = self.hash_string(last3, len(last3))
+        orth.shape = self.hash_string(shape, len(shape))
+        orth.norm = self.hash_string(norm, len(norm))
+
+        self.bacov[orth.last3] = last3
+        self.bacov[orth.shape] = shape
+        self.bacov[orth.norm] = norm
+
+        self.ortho[0][hashed] = <size_t>orth
+        return orth
+
 
 cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
@@ -137,7 +207,7 @@ cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
 cpdef vector[size_t] expand_chunk(size_t addr) except *:
     cdef vector[size_t] tokens = vector[size_t]()
     word = <Lexeme*>addr
-    while word is not NULL:
+    while word != NULL:
         tokens.push_back(<size_t>word)
         word = word.tail
     return tokens
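The tail recursion at the end of init_lexeme is what turns a chunk into a chain of sub-tokens: each Lexeme's tail points at the lexeme for the remainder of the string, and expand_chunk flattens that chain. A usage sketch, with the lookup signature inferred from the recursive call above (a split of -1 defers to find_split):

    cdef size_t addr = lang.lookup(-1, u"isn't", 5)
    cdef vector[size_t] toks = expand_chunk(addr)  # one entry per sub-token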

View File

@@ -1,5 +1,6 @@
 # cython: profile=True
 
 cpdef unicode substr(unicode string, int start, int end, size_t length):
     if end >= length:
         end = -1

View File

@@ -1,6 +1,5 @@
 from libcpp.vector cimport vector
-from spacy.lexeme cimport Lexeme
-from spacy.lexeme cimport Lexeme_addr
+from spacy.spacy cimport Lexeme_addr
 from cython.operator cimport dereference as deref
 from spacy.spacy cimport Language

View File

@@ -2,6 +2,11 @@ from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as inc
 
+from spacy.lexeme cimport Lexeme
+from spacy.lexeme cimport norm_of
+from spacy.spacy cimport StringHash
+
 
 cdef class Tokens:
     def __cinit__(self, Language lang):
         self.lang = lang
@@ -38,11 +43,11 @@ cdef class Tokens:
     cpdef dict count_by(self, Field attr):
         counts = {}
         cdef Lexeme_addr t
-        cdef Lexeme* word
+        cdef StringHash key
         for t in self.vctr[0]:
-            word = <Lexeme*>t
-            if word.lex not in counts:
-                counts[word.lex] = 0
-            counts[word.lex] += 1
+            key = norm_of(t)
+            if key not in counts:
+                counts[key] = 0
+            counts[key] += 1
         return counts
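count_by now keys on the norm hash rather than the lex hash, so case variants of a word collapse into one bucket. A hypothetical usage sketch (the Field argument is still accepted, but as of this commit the body always counts norms):

    tokens = lang.tokenize(u'Hello hello HELLO !')
    counts = tokens.count_by(attr)
    # => {<norm hash of u'hello'>: 3, <norm hash of u'!'>: 1}
    # The keys are StringHash values; recover the text via the bacov dict.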