* Reforming data model for lexemes

Matthew Honnibal 2014-08-19 02:40:37 +02:00
parent e091b6a241
commit 3379d7a571
5 changed files with 90 additions and 67 deletions

View File

@@ -64,3 +64,33 @@ cpdef Lexeme_addr lookup(unicode string) except 0:

cpdef unicode unhash(StringHash hash_value):
    return EN.unhash(hash_value)

+cpdef bint is_oft_upper(size_t lex_id):
+    '''Access the `oft_upper' flag of the Lexeme pointed to by lex_id, which
+    stores whether the lowered version of the string hashed by `lex' is found
+    in all-upper case frequently in a large sample of text. Users are free
+    to load different data; by default we use a sample from Wikipedia, with
+    a threshold of 0.95, picked to maximize mutual information for POS tagging.
+
+    >>> is_oft_upper(lookup(u'abc'))
+    True
+    >>> is_oft_upper(lookup(u'aBc')) # This must get the same answer
+    True
+    '''
+    return (<Lexeme*>lex_id).dist.flags & (1 << OFT_UPPER)
+
+
+cpdef bint is_oft_title(size_t lex_id):
+    '''Access the `oft_title' flag of the Lexeme pointed to by lex_id, which
+    stores whether the lowered version of the string hashed by `lex' is found
+    title-cased frequently in a large sample of text. Users are free
+    to load different data; by default we use a sample from Wikipedia, with
+    a threshold of 0.3, picked to maximize mutual information for POS tagging.
+
+    >>> is_oft_title(lookup(u'marcus'))
+    True
+    >>> is_oft_title(lookup(u'MARCUS')) # This must get the same value
+    True
+    '''
+    return (<Lexeme*>lex_id).dist.flags & (1 << OFT_TITLE)
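As a point of orientation for the lines above: after this change, `oft_upper' and `oft_title' are no longer standalone fields but bits inside the 8-bit dist.flags value, indexed by their position in the DistFlag enum (OFT_UPPER = 0, OFT_TITLE = 1). A minimal pure-Python sketch of that convention follows; the set_flag/check_flag helpers are hypothetical and not part of the commit.

    # Illustrative sketch of the assumed bit-flag convention; OFT_UPPER and
    # OFT_TITLE mirror the DistFlag enum order, the helpers are hypothetical.
    OFT_UPPER = 0
    OFT_TITLE = 1

    def set_flag(flags, flag):
        # Turn on the bit at position `flag` in an integer flags field.
        return flags | (1 << flag)

    def check_flag(flags, flag):
        # Mirror of the checks above: test the bit at position `flag`.
        return bool(flags & (1 << flag))

    flags = set_flag(0, OFT_TITLE)
    assert check_flag(flags, OFT_TITLE)
    assert not check_flag(flags, OFT_UPPER)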

View File

@@ -9,13 +9,35 @@ ctypedef char Bits8
ctypedef uint64_t Bits64

+cdef enum OrthFlag:
+    IS_ALPHA
+    IS_DIGIT
+    IS_PUNCT
+    IS_WHITE
+    IS_LOWER
+    IS_UPPER
+    IS_TITLE
+    IS_ASCII
+
+
+cdef enum DistFlag:
+    OFT_UPPER
+    OFT_TITLE
+    DIST_FLAG3
+    DIST_FLAG4
+    DIST_FLAG5
+    DIST_FLAG6
+    DIST_FLAG7
+    DIST_FLAG8
+
+
cdef struct Orthography:
    StringHash last3
    StringHash shape
    StringHash norm
    size_t length
-    unsigned char first
+    Py_UNICODE first
    Bits8 flags

@@ -27,15 +49,12 @@ cdef struct Distribution:

cdef struct Lexeme:
-    StringHash sic      # Hash of the original string
-    StringHash lex      # Hash of the word, with punctuation and clitics split off
-    Distribution* dist  # Distribution info, lazy loaded
+    StringHash lex      # Hash of the word
    Orthography* orth   # Extra orthographic views
-    #Lexeme* tail       # Lexemes are linked lists, to deal with sub-tokens
+    Distribution* dist  # Distribution info


-cdef Lexeme BLANK_WORD = Lexeme(0, 0, NULL, NULL, NULL)
+cdef Lexeme BLANK_WORD = Lexeme(0, NULL, NULL)


cdef enum StringAttr:

@@ -49,7 +68,16 @@ cdef enum StringAttr:

cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0

cpdef StringHash lex_of(size_t lex_id) except 0
cpdef StringHash norm_of(size_t lex_id) except 0
cpdef StringHash shape_of(size_t lex_id) except 0
cpdef StringHash last3_of(size_t lex_id) except 0
-cpdef StringHash length_of(size_t lex_id)
+cpdef size_t length_of(size_t lex_id) except *
+cpdef Py_UNICODE first_of(size_t lex_id) except *
+cpdef double prob_of(size_t lex_id) except 0
+cpdef ClusterID cluster_of(size_t lex_id) except 0
+
+cpdef bint check_orth_flag(size_t lex, OrthFlag flag) except *
+cpdef bint check_dist_flag(size_t lex, DistFlag flag) except *
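For orientation, the reformed data model reads as: a Lexeme carries only the word hash plus pointers to its Orthography and Distribution views. Below is a rough Python mirror of that layout, not part of the commit; field names for Orthography and Lexeme follow the .pxd above, while the Distribution fields are inferred from the accessors (prob_of, cluster_of, check_dist_flag) and from load_dist_info, since that struct body is outside this diff.

    # Approximate Python mirror of the reformed structs, for illustration only.
    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class Orthography:
        last3: int = 0    # StringHash of the last three characters
        shape: int = 0    # StringHash of the word shape
        norm: int = 0     # StringHash of the normalised form
        length: int = 0
        first: str = ''   # Py_UNICODE first character
        flags: int = 0    # Bits8 holding OrthFlag bits

    @dataclass
    class Distribution:
        prob: float = 0.0   # per prob_of above
        cluster: int = 0    # per cluster_of above (ClusterID)
        tagdict: int = 0    # assumed bitset of frequent tags
        flags: int = 0      # Bits8 holding DistFlag bits

    @dataclass
    class Lexeme:
        lex: int = 0                          # StringHash of the word
        orth: Optional[Orthography] = None    # extra orthographic views
        dist: Optional[Distribution] = None   # distribution info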

View File

@@ -13,13 +13,6 @@ from libcpp.vector cimport vector

from spacy.spacy cimport StringHash

-# Reiterate the enum, for python
-#SIC = StringAttr.sic
-#LEX = StringAttr.lex
-#NORM = StringAttr.norm
-#SHAPE = StringAttr.shape
-#LAST3 = StringAttr.last3
-

cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0:
    if attr == LEX:

@@ -133,37 +126,9 @@ cpdef double prob_of(size_t lex_id):
    return (<Lexeme*>lex_id).dist.prob


-cpdef bint is_oft_upper(size_t lex_id):
-    '''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which
-    stores whether the lowered version of the string hashed by `lex' is found
-    in all-upper case frequently in a large sample of text. Users are free
-    to load different data, by default we use a sample from Wikipedia, with
-    a threshold of 0.95, picked to maximize mutual information for POS tagging.
-
-    >>> is_oft_upper(lookup(u'abc'))
-    True
-    >>> is_oft_upper(lookup(u'aBc')) # This must get the same answer
-    True
-    '''
-    return False
-    #cdef Lexeme* w = <Lexeme*>lex_id
-    #return w.orth.last3 if w.orth != NULL else 0
-    #return (<Lexeme*>lex_id).oft_upper
+cpdef bint check_orth_flag(size_t lex, OrthFlag flag) except *:
+    return (<Lexeme*>lex).orth.flags & (1 << flag)


-cpdef bint is_oft_title(size_t lex_id):
-    '''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which
-    stores whether the lowered version of the string hashed by `lex' is found
-    title-cased frequently in a large sample of text. Users are free
-    to load different data, by default we use a sample from Wikipedia, with
-    a threshold of 0.3, picked to maximize mutual information for POS tagging.
-
-    >>> is_oft_title(lookup(u'marcus'))
-    True
-    >>> is_oft_title(lookup(u'MARCUS')) # This must get the same value
-    True
-    '''
-    return False
-    #return (<Lexeme*>lex_id).oft_title
+cpdef bint check_dist_flag(size_t lex, DistFlag flag) except *:
+    return (<Lexeme*>lex).dist.flags & (1 << flag)
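Call sites migrate accordingly: where the old module exposed one accessor per distributional property, the new one takes a DistFlag. A doctest-style sketch of the change, reusing the lookup example from the removed docstrings (the expected output assumes the same Wikipedia-derived case statistics those docstrings describe):

    >>> check_dist_flag(lookup(u'marcus'), OFT_TITLE)   # was: is_oft_title(lookup(u'marcus'))
    True
    >>> check_dist_flag(lookup(u'MARCUS'), OFT_TITLE)   # same lexeme, same answer
    True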

View File

@@ -60,6 +60,7 @@ cdef class Language:
        self.chunks.set_empty_key(0)
        self.vocab.set_empty_key(0)
        self.load_tokenization(util.read_tokenization(name))
+        self.load_dist_info(util.read_dist_info(name))

    cdef Tokens tokenize(self, unicode string):
        cdef Lexeme** chunk

@@ -108,7 +109,8 @@ cdef class Language:
        word.lex = hash(string)
        self.bacov[word.lex] = string
        word.orth = self.new_orth(string)
-        word.dist = self.new_dist(string)
+        word.dist = <Distribution*>calloc(1, sizeof(Distribution))
        self.vocab[word.lex] = <size_t>word
        return word

@@ -135,13 +137,8 @@ cdef class Language:
        self.bacov[orth.last3] = last3
        self.bacov[orth.norm] = norm
        self.bacov[orth.shape] = shape
        return orth

-    cdef Distribution* new_dist(self, unicode lex) except NULL:
-        dist = <Distribution*>calloc(1, sizeof(Distribution))
-        return dist
-
    cdef unicode unhash(self, StringHash hash_value):
        '''Fetch a string from the reverse index, given its hash value.'''
        return self.bacov[hash_value]

@@ -164,21 +161,18 @@ cdef class Language:
        for chunk, tokens in token_rules:
            self.new_chunk(chunk, tokens)

-    def load_clusters(self):
+    def load_dist_info(self, dist_info):
+        cdef unicode string
+        cdef dict word_dist
        cdef Lexeme* w
-        data_dir = path.join(path.dirname(__file__), '..', 'data', 'en')
-        case_stats = util.load_case_stats(data_dir)
-        brown_loc = path.join(data_dir, 'clusters')
-        cdef size_t start
-        cdef int end
-        with util.utf8open(brown_loc) as browns_file:
-            for i, line in enumerate(browns_file):
-                cluster_str, token_string, freq_str = line.split()
-                # Decode as a little-endian string, so that we can do & 15 to get
-                # the first 4 bits. See redshift._parse_features.pyx
-                cluster = int(cluster_str[::-1], 2)
-                upper_pc, title_pc = case_stats.get(token_string.lower(), (0.0, 0.0))
-                self.new_lexeme(token_string)
+        for string, word_dist in dist_info.items():
+            w = self.lookup(string)
+            w.dist.prob = word_dist['prob']
+            w.dist.cluster = word_dist['cluster']
+            for flag in word_dist['flags']:
+                w.dist.flags |= lexeme.DIST_FLAGS[flag]
+            for tag in word_dist['tagdict']:
+                w.dist.tagdict |= lexeme.TAGS[tag]


cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
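One detail from the removed load_clusters code worth unpacking is the little-endian comment: reversing the cluster bit-string before int(..., 2) puts the first bits of the Brown cluster path into the low-order bits, so masking with & 15 recovers the first four bits of the path. A small worked example, with an invented cluster string:

    # Worked example of the little-endian decoding trick; '10110100' is an
    # invented Brown cluster path, not data from the commit.
    cluster_str = '10110100'
    cluster = int(cluster_str[::-1], 2)   # parse the reversed bit string -> 45
    first_four = cluster & 15             # low 4 bits == first 4 path bits
    assert first_four == int(cluster_str[:4][::-1], 2) == 0b1101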

View File

@@ -19,6 +19,12 @@ def load_case_stats(data_dir):
    return case_stats


+def read_dist_info(lang):
+    with open(path.join(DATA_DIR, lang, 'distribution_info.json')) as file_:
+        dist_info = json.load(file_)
+    return dist_info


def read_tokenization(lang):
    loc = path.join(DATA_DIR, lang, 'tokenization')
    entries = []
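The new read_dist_info helper expects a distribution_info.json file keyed by word string. The on-disk schema is not spelled out in this commit; the sketch below is an assumed layout that simply mirrors the fields Language.load_dist_info consumes (prob, cluster, flags, tagdict), with invented values.

    # Hypothetical distribution_info.json contents, written from Python so the
    # example is self-contained; every value here is invented for illustration.
    import json

    example = {
        "the":    {"prob": -3.2,  "cluster": 6,  "flags": [],            "tagdict": ["DT"]},
        "Marcus": {"prob": -11.8, "cluster": 11, "flags": ["OFT_TITLE"], "tagdict": ["NNP"]},
    }

    with open('distribution_info.json', 'w') as file_:
        json.dump(example, file_)

    with open('distribution_info.json') as file_:
        assert json.load(file_)["Marcus"]["flags"] == ["OFT_TITLE"]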