Mirror of https://github.com/explosion/spaCy.git
* Reforming data model for lexemes

commit 3379d7a571
parent e091b6a241
spacy/en.pyx | 30
@@ -64,3 +64,33 @@ cpdef Lexeme_addr lookup(unicode string) except 0:
cpdef unicode unhash(StringHash hash_value):
    return EN.unhash(hash_value)


cpdef bint is_oft_upper(size_t lex_id):
    '''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which
    stores whether the lowered version of the string hashed by `lex' is found
    in all-upper case frequently in a large sample of text. Users are free
    to load different data, by default we use a sample from Wikipedia, with
    a threshold of 0.95, picked to maximize mutual information for POS tagging.

    >>> is_oft_upper(lookup(u'abc'))
    True
    >>> is_oft_upper(lookup(u'aBc')) # This must get the same answer
    True
    '''
    return (<Lexeme*>lex_id).dist.flags & OFT_UPPER


cpdef bint is_oft_title(size_t lex_id):
    '''Access the `oft_title' field of the Lexeme pointed to by lex_id, which
    stores whether the lowered version of the string hashed by `lex' is found
    title-cased frequently in a large sample of text. Users are free
    to load different data, by default we use a sample from Wikipedia, with
    a threshold of 0.3, picked to maximize mutual information for POS tagging.

    >>> is_oft_title(lookup(u'marcus'))
    True
    >>> is_oft_title(lookup(u'MARCUS')) # This must get the same value
    True
    '''
    return (<Lexeme*>lex_id).dist.flags & OFT_TITLE
@@ -9,13 +9,35 @@ ctypedef char Bits8
ctypedef uint64_t Bits64


cdef enum OrthFlag:
    IS_ALPHA
    IS_DIGIT
    IS_PUNCT
    IS_WHITE
    IS_LOWER
    IS_UPPER
    IS_TITLE
    IS_ASCII


cdef enum DistFlag:
    OFT_UPPER
    OFT_TITLE
    DIST_FLAG3
    DIST_FLAG4
    DIST_FLAG5
    DIST_FLAG6
    DIST_FLAG7
    DIST_FLAG8


cdef struct Orthography:
    StringHash last3
    StringHash shape
    StringHash norm

    size_t length
    unsigned char first
    Py_UNICODE first
    Bits8 flags
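The two enums above name bit positions inside the 8-bit `flags' fields (Bits8), following the `1 << flag' masking that check_orth_flag and check_dist_flag use further down in this diff. A minimal, self-contained Python sketch of that convention; set_flag and has_flag are illustrative names, not part of the commit:

OFT_UPPER = 0   # cdef enums without explicit values number from 0, like C enums
OFT_TITLE = 1

def set_flag(flags, flag):
    # switch the bit for `flag` on
    return flags | (1 << flag)

def has_flag(flags, flag):
    # test whether the bit for `flag` is set
    return bool(flags & (1 << flag))

flags = 0
flags = set_flag(flags, OFT_TITLE)
assert has_flag(flags, OFT_TITLE)
assert not has_flag(flags, OFT_UPPER)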
@@ -27,15 +49,12 @@ cdef struct Distribution:

cdef struct Lexeme:
    StringHash sic # Hash of the original string
    StringHash lex # Hash of the word, with punctuation and clitics split off

    Distribution* dist # Distribution info, lazy loaded
    StringHash lex # Hash of the word
    Orthography* orth # Extra orthographic views
    #Lexeme* tail # Lexemes are linked lists, to deal with sub-tokens
    Distribution* dist # Distribution info


cdef Lexeme BLANK_WORD = Lexeme(0, 0, NULL, NULL, NULL)
cdef Lexeme BLANK_WORD = Lexeme(0, NULL, NULL)


cdef enum StringAttr:
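The reformed Lexeme keeps just the word hash plus two lazily attached views, matching the new three-argument BLANK_WORD initializer. A hedged pure-Python mirror of that layout; the class name and defaults here are illustrative only:

class LexemeSketch:
    # lex:  hash of the word form
    # orth: extra orthographic views, attached on demand
    # dist: distribution info, attached on demand
    __slots__ = ('lex', 'orth', 'dist')

    def __init__(self, lex=0, orth=None, dist=None):
        self.lex = lex
        self.orth = orth
        self.dist = dist

BLANK_WORD_SKETCH = LexemeSketch(0, None, None)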
@@ -49,7 +68,16 @@ cdef enum StringAttr:
cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0

cpdef StringHash lex_of(size_t lex_id) except 0

cpdef StringHash norm_of(size_t lex_id) except 0
cpdef StringHash shape_of(size_t lex_id) except 0
cpdef StringHash last3_of(size_t lex_id) except 0
cpdef StringHash length_of(size_t lex_id)

cpdef size_t length_of(size_t lex_id) except *
cpdef Py_UNICODE first_of(size_t lex_id) except *

cpdef double prob_of(size_t lex_id) except 0
cpdef ClusterID cluster_of(size_t lex_id) except 0

cpdef bint check_orth_flag(size_t lex, OrthFlag flag) except *
cpdef bint check_dist_flag(size_t lex, DistFlag flag) except *
@@ -13,13 +13,6 @@ from libcpp.vector cimport vector
from spacy.spacy cimport StringHash

# Reiterate the enum, for python
#SIC = StringAttr.sic
#LEX = StringAttr.lex
#NORM = StringAttr.norm
#SHAPE = StringAttr.shape
#LAST3 = StringAttr.last3


cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0:
    if attr == LEX:
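The hunk cuts attr_of off after its first branch. A hedged sketch of how the dispatch presumably continues, based on the StringAttr names reiterated in the comments above; the enum values and the accessor stubs are stand-ins so the sketch runs on its own:

LEX, NORM, SHAPE, LAST3 = 1, 2, 3, 4      # assumed StringAttr values

def lex_of(lex_id):   return 101          # stand-ins for the real accessors
def norm_of(lex_id):  return 102
def shape_of(lex_id): return 103
def last3_of(lex_id): return 104

def attr_of(lex_id, attr):
    if attr == LEX:
        return lex_of(lex_id)
    elif attr == NORM:
        return norm_of(lex_id)
    elif attr == SHAPE:
        return shape_of(lex_id)
    elif attr == LAST3:
        return last3_of(lex_id)
    raise ValueError(attr)

assert attr_of(0, SHAPE) == 103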
@@ -133,37 +126,9 @@ cpdef double prob_of(size_t lex_id):
    return (<Lexeme*>lex_id).dist.prob


cpdef bint is_oft_upper(size_t lex_id):
    '''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which
    stores whether the lowered version of the string hashed by `lex' is found
    in all-upper case frequently in a large sample of text. Users are free
    to load different data, by default we use a sample from Wikipedia, with
    a threshold of 0.95, picked to maximize mutual information for POS tagging.

    >>> is_oft_upper(lookup(u'abc'))
    True
    >>> is_oft_upper(lookup(u'aBc')) # This must get the same answer
    True
    '''
    return False
    #cdef Lexeme* w = <Lexeme*>lex_id
    #return w.orth.last3 if w.orth != NULL else 0
cpdef bint check_orth_flag(size_t lex, OrthFlag flag) except *:
    return (<Lexeme*>lex_id).orth.flags & (1 << flag)


    #return (<Lexeme*>lex_id).oft_upper


cpdef bint is_oft_title(size_t lex_id):
    '''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which
    stores whether the lowered version of the string hashed by `lex' is found
    title-cased frequently in a large sample of text. Users are free
    to load different data, by default we use a sample from Wikipedia, with
    a threshold of 0.3, picked to maximize mutual information for POS tagging.

    >>> is_oft_title(lookup(u'marcus'))
    True
    >>> is_oft_title(lookup(u'MARCUS')) # This must get the same value
    True
    '''
    return False
    #return (<Lexeme*>lex_id).oft_title
cpdef bint check_dist_flag(size_t lex, DistFlag flag) except *:
    return (<Lexeme*>lex_id).dist.flags & (1 << flag)
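The net effect of this hunk is an API shift: the per-property predicates is_oft_upper and is_oft_title give way to the generic check_orth_flag / check_dist_flag plus a flag constant. A rough standalone illustration of the new call pattern, with small stand-in classes in place of the C structs:

OFT_UPPER, OFT_TITLE = 0, 1                 # mirror the DistFlag order

class DistSketch:                           # stand-in for Distribution*
    def __init__(self, flags):
        self.flags = flags

class LexSketch:                            # stand-in for a Lexeme
    def __init__(self, dist):
        self.dist = dist

def check_dist_flag(lex, flag):
    # same masking as the cpdef version above
    return bool(lex.dist.flags & (1 << flag))

word = LexSketch(DistSketch(1 << OFT_TITLE))
# before this commit: is_oft_title(word); after it, the generic check:
assert check_dist_flag(word, OFT_TITLE)
assert not check_dist_flag(word, OFT_UPPER)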
@@ -60,6 +60,7 @@ cdef class Language:
        self.chunks.set_empty_key(0)
        self.vocab.set_empty_key(0)
        self.load_tokenization(util.read_tokenization(name))
        self.load_dist_info(util.read_dist_info(name))

    cdef Tokens tokenize(self, unicode string):
        cdef Lexeme** chunk
@@ -108,7 +109,8 @@ cdef class Language:
        word.lex = hash(string)
        self.bacov[word.lex] = string
        word.orth = self.new_orth(string)
        word.dist = self.new_dist(string)

        word.dist = <Distribution*>calloc(1, sizeof(Distribution))
        self.vocab[word.lex] = <size_t>word
        return word
@@ -135,13 +137,8 @@ cdef class Language:
        self.bacov[orth.last3] = last3
        self.bacov[orth.norm] = norm
        self.bacov[orth.shape] = shape

        return orth

    cdef Distribution* new_dist(self, unicode lex) except NULL:
        dist = <Distribution*>calloc(1, sizeof(Distribution))
        return dist

    cdef unicode unhash(self, StringHash hash_value):
        '''Fetch a string from the reverse index, given its hash value.'''
        return self.bacov[hash_value]
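Both new_dist and unhash above revolve around the reverse index self.bacov, which maps a string's hash back to the original unicode string. A minimal standalone sketch of that round trip; the intern helper is illustrative, not part of the commit:

bacov = {}                      # hash -> original string

def intern(string):
    key = hash(string)
    bacov[key] = string
    return key

def unhash(hash_value):
    '''Fetch a string from the reverse index, given its hash value.'''
    return bacov[hash_value]

key = intern(u'marcus')
assert unhash(key) == u'marcus'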
@@ -164,21 +161,18 @@ cdef class Language:
        for chunk, tokens in token_rules:
            self.new_chunk(chunk, tokens)

    def load_clusters(self):
    def load_dist_info(self, dist_info):
        cdef unicode string
        cdef dict word_dist
        cdef Lexeme* w
        data_dir = path.join(path.dirname(__file__), '..', 'data', 'en')
        case_stats = util.load_case_stats(data_dir)
        brown_loc = path.join(data_dir, 'clusters')
        cdef size_t start
        cdef int end
        with util.utf8open(brown_loc) as browns_file:
            for i, line in enumerate(browns_file):
                cluster_str, token_string, freq_str = line.split()
                # Decode as a little-endian string, so that we can do & 15 to get
                # the first 4 bits. See redshift._parse_features.pyx
                cluster = int(cluster_str[::-1], 2)
                upper_pc, title_pc = case_stats.get(token_string.lower(), (0.0, 0.0))
                self.new_lexeme(token_string)
        for string, word_dist in dist_info.items():
            w = self.lookup(string)
            w.prob = word_dist.prob
            w.cluster = word_dist.cluster
            for flag in word_dist.flags:
                w.flags |= lexeme.DIST_FLAGS[flag]
            for tag in word_dist.tagdict:
                w.tagdict |= lexeme.TAGS[tag]


cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
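load_dist_info copies prob, cluster, flags and tagdict from each word's record onto the lexeme. The concrete serialization of distribution_info.json is not shown in this diff; a hedged sketch of the record shape the loop above appears to expect, with placeholder values:

from collections import namedtuple

# One record per word: probability, cluster id, distributional flag names,
# and a tag dictionary (field names inferred from the loop above).
WordDist = namedtuple('WordDist', ['prob', 'cluster', 'flags', 'tagdict'])

dist_info = {
    u'marcus': WordDist(prob=-10.0,        # placeholder log-probability
                        cluster=0,         # placeholder cluster id
                        flags=['OFT_TITLE'],
                        tagdict=['NNP']),
}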
@@ -19,6 +19,12 @@ def load_case_stats(data_dir):
    return case_stats


def load_dist_info(lang):
    with path.join(DATA_DIR, lang, 'distribution_info.json') as file_:
        dist_info = json.load(file_)
    return dist_info


def read_tokenization(lang):
    loc = path.join(DATA_DIR, lang, 'tokenization')
    entries = []
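On the utilities side, load_dist_info needs to open the JSON file before json.load can read it; a hedged standalone sketch of what the helper presumably intends, where DATA_DIR is an assumption modelled on the data directory used elsewhere in this diff:

import json
from os import path

DATA_DIR = path.join(path.dirname(__file__), 'data')    # assumed location

def load_dist_info(lang):
    loc = path.join(DATA_DIR, lang, 'distribution_info.json')
    with open(loc) as file_:
        dist_info = json.load(file_)
    return dist_info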