* Working refactor, with updated data model for Lexemes

Matthew Honnibal 2014-08-19 04:21:20 +02:00
parent 3379d7a571
commit 5fddb8d165
5 changed files with 72 additions and 88 deletions

View File

@@ -66,31 +66,3 @@ cpdef unicode unhash(StringHash hash_value):
     return EN.unhash(hash_value)
 
-cpdef bint is_oft_upper(size_t lex_id):
-    '''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which
-    stores whether the lowered version of the string hashed by `lex' is found
-    in all-upper case frequently in a large sample of text. Users are free
-    to load different data, by default we use a sample from Wikipedia, with
-    a threshold of 0.95, picked to maximize mutual information for POS tagging.
-
-    >>> is_oft_upper(lookup(u'abc'))
-    True
-    >>> is_oft_upper(lookup(u'aBc')) # This must get the same answer
-    True
-    '''
-    return (<Lexeme*>lex_id).dist.flags & OFT_UPPER
-
-cpdef bint is_oft_title(size_t lex_id):
-    '''Access the `oft_title' field of the Lexeme pointed to by lex_id, which
-    stores whether the lowered version of the string hashed by `lex' is found
-    title-cased frequently in a large sample of text. Users are free
-    to load different data, by default we use a sample from Wikipedia, with
-    a threshold of 0.3, picked to maximize mutual information for POS tagging.
-
-    >>> is_oft_title(lookup(u'marcus'))
-    True
-    >>> is_oft_title(lookup(u'MARCUS')) # This must get the same value
-    True
-    '''
-    return (<Lexeme*>lex_id).dist.flags & OFT_TITLE

View File

@@ -32,12 +32,9 @@ cdef enum DistFlag:
 cdef struct Orthography:
+    StringHash last3
     StringHash shape
     StringHash norm
-    StringHash last3
-    size_t length
-    Py_UNICODE first
     Bits8 flags
@@ -49,12 +46,17 @@ cdef struct Distribution:
 cdef struct Lexeme:
-    StringHash lex     # Hash of the word
-    Orthography* orth  # Extra orthographic views
-    Distribution* dist # Distribution info
+    char* string
+    size_t length
+    StringHash lex
+    Orthography orth   # Extra orthographic views
+    Distribution dist  # Distribution info
 
-cdef Lexeme BLANK_WORD = Lexeme(0, NULL, NULL)
+cdef Lexeme BLANK_WORD = Lexeme(NULL, 0, 0,
+                                Orthography(0, 0, 0, 0),
+                                Distribution(0.0, 0, 0, 0)
+                               )
 
 cdef enum StringAttr:
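
The struct change above is the core of the refactor: Orthography and Distribution were previously reached through pointers that each needed their own allocation and a NULL check before use; they are now embedded in Lexeme directly, so one calloc covers the whole record. A minimal ctypes sketch of the idea (not spaCy code; the field widths are assumptions and Distribution is omitted for brevity):

    import ctypes

    class Orthography(ctypes.Structure):
        # assumed 64-bit hashes and an 8-bit flag field
        _fields_ = [("last3", ctypes.c_uint64),
                    ("shape", ctypes.c_uint64),
                    ("norm", ctypes.c_uint64),
                    ("flags", ctypes.c_uint8)]

    class OldLexeme(ctypes.Structure):
        # old layout: a separate heap allocation hangs off each word
        _fields_ = [("lex", ctypes.c_uint64),
                    ("orth", ctypes.POINTER(Orthography))]

    class NewLexeme(ctypes.Structure):
        # new layout: the orthographic view lives inline
        _fields_ = [("string", ctypes.c_char_p),
                    ("length", ctypes.c_size_t),
                    ("lex", ctypes.c_uint64),
                    ("orth", Orthography)]

    word = NewLexeme()
    word.orth.norm = 12345   # no NULL check before touching orth
    print(ctypes.sizeof(OldLexeme), ctypes.sizeof(NewLexeme))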
@@ -68,13 +70,11 @@ cdef enum StringAttr:
 cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0
 
 cpdef StringHash lex_of(size_t lex_id) except 0
 cpdef StringHash norm_of(size_t lex_id) except 0
 cpdef StringHash shape_of(size_t lex_id) except 0
 cpdef StringHash last3_of(size_t lex_id) except 0
 cpdef size_t length_of(size_t lex_id) except *
-cpdef Py_UNICODE first_of(size_t lex_id) except *
 cpdef double prob_of(size_t lex_id) except 0
 cpdef ClusterID cluster_of(size_t lex_id) except 0

View File

@@ -72,7 +72,7 @@ cpdef StringHash last3_of(size_t lex_id) except 0:
     return (<Lexeme*>lex_id).orth.last3
 
-cpdef ClusterID cluster_of(size_t lex_id):
+cpdef ClusterID cluster_of(size_t lex_id) except 0:
     '''Access the `cluster' field of the Lexeme pointed to by lex_id, which
     gives an integer representation of the cluster ID of the word,
     which should be understood as a binary address:
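
The docstring's "binary address" phrasing deserves a gloss: Brown-style cluster IDs encode a path through a binary merge tree, so two words belong to the same coarse cluster exactly when their addresses share a prefix. A sketch with invented cluster values:

    def same_coarse_cluster(path_a, path_b, depth):
        # agreement on the first `depth` bits of the address means
        # membership in the same coarse-grained cluster
        return path_a[:depth] == path_b[:depth]

    print(same_coarse_cluster("10110", "10111", 4))  # True: tree siblings
    print(same_coarse_cluster("10110", "01101", 1))  # False: split at root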
@@ -99,21 +99,17 @@ cpdef Py_UNICODE first_of(size_t lex_id):
     >>> unhash(first_of(lex_id))
     u'H'
     '''
-    if (<Lexeme*>lex_id).orth == NULL:
-        return 0
     return (<Lexeme*>lex_id).orth.first
 
-cpdef StringHash length_of(size_t lex_id):
+cpdef size_t length_of(size_t lex_id) except *:
     '''Access the `length' field of the Lexeme pointed to by lex_id, which stores
     the length of the string hashed by lex_of.'''
     cdef Lexeme* word = <Lexeme*>lex_id
-    if (<Lexeme*>lex_id).orth == NULL:
-        return 0
-    return (<Lexeme*>lex_id).orth.length
+    return word.length
 
-cpdef double prob_of(size_t lex_id):
+cpdef double prob_of(size_t lex_id) except 0:
     '''Access the `prob' field of the Lexeme pointed to by lex_id, which stores
     the smoothed unigram log probability of the word, as estimated from a large
     text corpus. By default, probabilities are based on counts from Gigaword,
@@ -126,9 +122,38 @@ cpdef double prob_of(size_t lex_id):
     return (<Lexeme*>lex_id).dist.prob
 
+cpdef bint is_oft_upper(size_t lex_id):
+    '''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which
+    stores whether the lowered version of the string hashed by `lex' is found
+    in all-upper case frequently in a large sample of text. Users are free
+    to load different data, by default we use a sample from Wikipedia, with
+    a threshold of 0.95, picked to maximize mutual information for POS tagging.
+
+    >>> is_oft_upper(lookup(u'abc'))
+    True
+    >>> is_oft_upper(lookup(u'aBc')) # This must get the same answer
+    True
+    '''
+    return (<Lexeme*>lex_id).dist.flags & OFT_UPPER
+
+cpdef bint is_oft_title(size_t lex_id):
+    '''Access the `oft_title' field of the Lexeme pointed to by lex_id, which
+    stores whether the lowered version of the string hashed by `lex' is found
+    title-cased frequently in a large sample of text. Users are free
+    to load different data, by default we use a sample from Wikipedia, with
+    a threshold of 0.3, picked to maximize mutual information for POS tagging.
+
+    >>> is_oft_title(lookup(u'marcus'))
+    True
+    >>> is_oft_title(lookup(u'MARCUS')) # This must get the same value
+    True
+    '''
+    return (<Lexeme*>lex_id).dist.flags & OFT_TITLE
+
-cpdef bint check_orth_flag(size_t lex, OrthFlag flag) except *:
+cpdef bint check_orth_flag(size_t lex_id, OrthFlag flag) except *:
     return (<Lexeme*>lex_id).orth.flags & (1 << flag)
 
-cpdef bint check_dist_flag(size_t lex, DistFlag flag) except *:
+cpdef bint check_dist_flag(size_t lex_id, DistFlag flag) except *:
     return (<Lexeme*>lex_id).dist.flags & (1 << flag)
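
For readers new to the flag scheme: dist.flags packs every boolean distributional property into one integer, so check_dist_flag is a single shift-and-mask. A plain-Python sketch (the bit positions are assumptions, not the real DistFlag enum values):

    OFT_UPPER = 1   # hypothetical enum values
    OFT_TITLE = 2

    def check_dist_flag(flags, flag):
        # mirrors the Cython version: shift the flag index into a mask
        return bool(flags & (1 << flag))

    flags = 0
    flags |= (1 << OFT_TITLE)                  # set while loading dist info
    print(check_dist_flag(flags, OFT_TITLE))   # True
    print(check_dist_flag(flags, OFT_UPPER))   # False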

View File

@@ -21,7 +21,6 @@ ctypedef int ClusterID
 from spacy.lexeme cimport Lexeme
 from spacy.lexeme cimport Distribution
 from spacy.lexeme cimport Orthography
-from spacy._hashing cimport WordTree
 
 cdef class Language:
@@ -37,8 +36,6 @@ cdef class Language:
     cdef Lexeme** new_chunk(self, unicode string, list substrings) except NULL
     cdef Lexeme* new_lexeme(self, unicode lex) except NULL
-    cdef Orthography* new_orth(self, unicode lex) except NULL
-    cdef Distribution* new_dist(self, unicode lex) except NULL
 
     cdef unicode unhash(self, StringHash hashed)

View File

@@ -13,15 +13,19 @@ from spacy.string_tools cimport substr
 from . import util
 from os import path
 
+DIST_FLAGS = {}
+TAGS = {}
+
-def get_normalized(unicode lex, size_t length):
+def get_normalized(unicode lex):
     if lex.isalpha() and lex.islower():
         return lex
     else:
-        return get_word_shape(lex, length)
+        return get_word_shape(lex)
 
-def get_word_shape(unicode lex, length):
+def get_word_shape(unicode lex):
+    cdef size_t length = len(lex)
     shape = ""
     last = ""
     shape_char = ""
@@ -47,7 +51,7 @@ def get_word_shape(unicode lex, length):
     return shape
 
-def set_orth_flags(lex, length):
+def set_orth_flags(lex):
     return 0
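
The body of get_word_shape is elided by these hunks; for orientation, a hedged plain-Python reimplementation of the usual word-shape idea (the exact character classes and run cap are assumptions, not the Cython body):

    def word_shape(lex, max_run=4):
        shape = []
        last = ""
        run = 0
        for c in lex:
            # collapse letters to x/X and digits to d; keep punctuation
            if c.isalpha():
                char_class = "X" if c.isupper() else "x"
            elif c.isdigit():
                char_class = "d"
            else:
                char_class = c
            run = run + 1 if char_class == last else 1
            if run <= max_run:   # truncate long runs of one class
                shape.append(char_class)
            last = char_class
        return "".join(shape)

    print(word_shape(u"Google"))  # 'Xxxxx'
    print(word_shape(u"C3PO-2"))  # 'XdXX-d'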
@@ -60,7 +64,7 @@ cdef class Language:
         self.chunks.set_empty_key(0)
         self.vocab.set_empty_key(0)
         self.load_tokenization(util.read_tokenization(name))
-        self.load_dist_info(util.read_dist_info(name))
+        #self.load_dist_info(util.read_dist_info(name))
 
     cdef Tokens tokenize(self, unicode string):
         cdef Lexeme** chunk
@@ -106,39 +110,25 @@ cdef class Language:
     cdef Lexeme* new_lexeme(self, unicode string) except NULL:
         cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
+        cdef bytes byte_string = string.encode('utf8')
+        word.string = <char*>byte_string
+        word.length = len(byte_string)
+        word.orth.flags = set_orth_flags(string)
+        cdef unicode norm = get_normalized(string)
+        cdef unicode shape = get_word_shape(string)
+        cdef unicode last3 = string[-3:]
         word.lex = hash(string)
+        word.orth.norm = hash(norm)
+        word.orth.shape = hash(shape)
+        word.orth.last3 = hash(last3)
         self.bacov[word.lex] = string
-        word.orth = self.new_orth(string)
-        word.dist = <Distribution*>calloc(1, sizeof(Distribution))
-        self.vocab[word.lex] = <size_t>word
+        self.bacov[word.orth.norm] = norm
+        self.bacov[word.orth.shape] = shape
+        self.bacov[word.orth.last3] = last3
+        self.vocab[hash(string)] = <size_t>word
         return word
 
-    cdef Orthography* new_orth(self, unicode lex) except NULL:
-        cdef unicode last3
-        cdef unicode norm
-        cdef unicode shape
-        cdef int length
-        length = len(lex)
-        orth = <Orthography*>calloc(1, sizeof(Orthography))
-        orth.first = lex[0]
-        orth.length = length
-        orth.flags = set_orth_flags(lex, orth.length)
-        orth.norm = hash(lex)
-        last3 = substr(lex, length - 3, length, length)
-        orth.last3 = hash(last3)
-        norm = get_normalized(lex, length)
-        orth.norm = hash(norm)
-        shape = get_word_shape(lex, length)
-        orth.shape = hash(shape)
-        self.bacov[orth.last3] = last3
-        self.bacov[orth.norm] = norm
-        self.bacov[orth.shape] = shape
-        return orth
-
     cdef unicode unhash(self, StringHash hash_value):
         '''Fetch a string from the reverse index, given its hash value.'''
         return self.bacov[hash_value]
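
The invariant new_lexeme maintains is that every hash it hands out is registered in self.bacov (the "backwards vocab"), so unhash can always round-trip. A dict-based stand-in for the two tables (the real code uses dense_hash_map; intern and the normalization rule here are hypothetical simplifications):

    bacov = {}   # StringHash -> unicode, the reverse index behind unhash()
    vocab = {}   # StringHash -> lexeme record

    def intern(string):
        key = hash(string)
        if key not in vocab:
            norm = string.lower()             # stand-in for get_normalized
            record = {"lex": key, "norm": hash(norm), "length": len(string)}
            vocab[key] = record
            bacov[key] = string               # register every hash we store
            bacov[record["norm"]] = norm
        return vocab[key]

    word = intern(u"Hello")
    print(bacov[word["lex"]])  # u'Hello', i.e. what unhash() would return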
@@ -167,12 +157,12 @@ cdef class Language:
         cdef Lexeme* w
         for string, word_dist in dist_info.items():
             w = self.lookup(string)
-            w.prob = word_dist.prob
-            w.cluster = word_dist.cluster
+            w.dist.prob = word_dist.prob
+            w.dist.cluster = word_dist.cluster
             for flag in word_dist.flags:
-                w.flags |= lexeme.DIST_FLAGS[flag]
+                w.dist.flags |= DIST_FLAGS[flag]
             for tag in word_dist.tagdict:
-                w.tagdict |= lexeme.TAGS[tag]
+                w.dist.tagdict |= TAGS[tag]
 
 cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
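
One last note on load_dist_info: the distributional fields now live behind w.dist, and string-valued flags from the data file are OR-ed into the packed flags field. A sketch with an invented record and flag table:

    DIST_FLAGS = {"oft_upper": 1 << 1, "oft_title": 1 << 2}  # assumed bits

    class Dist(object):
        def __init__(self):
            self.prob = 0.0
            self.cluster = 0
            self.flags = 0

    dist = Dist()
    record = {"prob": -8.2, "cluster": 0b10110, "flags": ["oft_title"]}
    dist.prob = record["prob"]
    dist.cluster = record["cluster"]
    for flag in record["flags"]:
        dist.flags |= DIST_FLAGS[flag]                 # pack booleans into bits
    print(bool(dist.flags & DIST_FLAGS["oft_title"]))  # True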