* Remove Lexeme struct, preparing to rename Word to Lexeme.

2025-01-27 01:34:30 +03:00 · 2014-08-24 19:24:42 +02:00 · 2014-08-24 19:24:42 +02:00 · 88095666dc
commit 88095666dc
parent ce59526011
6 changed files with 18 additions and 204 deletions
--- a/spacy/en.pxd
+++ b/spacy/en.pxd
@ -3,14 +3,13 @@ from libcpp.vector cimport vector
 from spacy.spacy cimport StringHash
 from spacy.spacy cimport Language
-from spacy.word cimport Word
+from spacy.word cimport LatinWord
 from spacy.tokens cimport Tokens
 cimport cython
 cdef class English(spacy.Language):
    cdef int find_split(self, unicode word)
-    cdef int set_orth(self, unicode word, Word lex) except -1
+    cdef LatinWord new_lexeme(self, unicode string)
 cdef English EN
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@ -44,6 +44,9 @@ cimport spacy
 cdef class English(spacy.Language):
    cdef LatinWord new_lexeme(self, unicode string):
        """Build the lexeme object for `string`.

        For English this is a LatinWord constructed directly from the
        unicode string.
        """
        return LatinWord(string)
    cdef int find_split(self, unicode word):
        cdef size_t length = len(word)
        cdef int i = 0
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@ -1,40 +0,0 @@
 from libc.stdint cimport uint32_t
 from libc.stdint cimport uint64_t
 cimport cython
 ctypedef int ClusterID
 ctypedef uint32_t StringHash
 ctypedef size_t LexID
 ctypedef char OrthFlags
 ctypedef char DistFlags
 ctypedef uint64_t TagFlags
 # Per-word record. The accessor functions declared below receive the address
 # of one of these as a LexID and cast it back to Lexeme*.
 cdef struct Lexeme:
    StringHash lex  # Hash of the word's string (returned by lex_of).
    char* string  # Bytes of the word; first_of reads string[0] as the first UTF-8 byte.
    size_t length  # Length of the word (length_of documents it as the unicode length).
    double prob  # Estimate of the unigram log probability (returned by prob_of).
    ClusterID cluster  # Integer Brown-cluster ID (returned by cluster_of).
    TagFlags possible_tags  # Bit set tested by can_tag.
    DistFlags dist_flags  # Bit set tested by check_dist_flag / is_often_uppered / is_often_titled.
    OrthFlags orth_flags  # Bit set written by set_flags and tested by check_orth_flag.
    StringHash* string_views  # Array of hashes indexed by view_of; no element count is kept here.
 # Accessors over a Lexeme addressed by lex_id (a Lexeme* carried as an integer).
 # NOTE(review): with `except 0` / `except 1`, Cython treats that return value
 # as the error signal, so a legitimate result equal to it (e.g. a cluster ID
 # of 0) collides with the error convention -- confirm this is intended, or
 # consider the `except? <value>` form.
 cpdef StringHash lex_of(LexID lex_id) except 0
 cpdef char first_of(LexID lex_id) except 0
 cpdef size_t length_of(LexID lex_id) except 0
 cpdef double prob_of(LexID lex_id) except 1
 cpdef ClusterID cluster_of(LexID lex_id) except 0
 cpdef bint is_often_titled(size_t lex_id)
 cpdef bint is_often_uppered(size_t lex_id)
 cpdef bint can_tag(LexID lex, TagFlags flag) except *
 cpdef bint check_dist_flag(LexID lex, DistFlags flag) except *
 cpdef bint check_orth_flag(LexID lex, OrthFlags flag) except *
 cpdef StringHash view_of(LexID lex_id, size_t view) except 0
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@ -1,155 +0,0 @@
 # cython: profile=True
 # cython: embedsignature=True
 '''Accessors for Lexeme properties, given a lex_id, which is cast to a Lexeme*.
 Mostly useful from Python-space. From Cython-space, you can just cast to
 Lexeme* yourself.
 '''
 from __future__ import unicode_literals
 from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t
 from spacy.spacy cimport StringHash
 # Python-visible enum for POS tags.
 # These are plain module-level integers (not a cdef enum), so they can be
 # imported from Python code.
 # NOTE(review): presumably each value is the bit index passed as `flag` to
 # can_tag() -- confirm against callers.
 PUNCT = 0
 CONJ = 1
 NUM = 2
 X = 3
 DET = 4
 ADP = 5
 ADJ = 6
 ADV = 7
 VERB = 8
 NOUN = 9
 PDT = 10
 POS = 11
 PRON = 12
 PRT = 13
 cpdef int set_flags(LexID lex_id, object active_flags) except *:
    """Set orthographic bit flags for a Lexeme.

    Flags are OR-ed into the existing orth_flags value; bits that are already
    set are never cleared.

    Args:
        lex_id (LexID): A reference ID for a Lexeme (its address, cast back
            to Lexeme* here).
        active_flags: An iterable of bit positions to set as True.
    """
    cdef size_t flag
    cdef Lexeme* w = <Lexeme*>lex_id
    for flag in active_flags:
        # Each entry is a bit index, not a mask.
        w.orth_flags |= 1 << flag
 cpdef StringHash view_of(LexID lex_id, size_t view) except 0:
    '''Access the hash stored at index `view` of the word's string_views array.

    The index is used directly on the C array; no bounds check is performed.
    '''
    return (<Lexeme*>lex_id).string_views[view]
 cpdef StringHash lex_of(LexID lex_id) except 0:
    '''Access a hash of the word's string.

    lex_id is cast to a Lexeme* and its `lex` field is returned.

    >>> lex_of(lookup(u'Hi')) == hash(u'Hi')
    True
    '''
    return (<Lexeme*>lex_id).lex
 cpdef ClusterID cluster_of(LexID lex_id) except 0:
    '''Access an integer representation of the word's Brown cluster.

    A Brown cluster is an address into a binary tree, which gives some (noisy)
    information about the word's distributional context.

    >>> strings = (u'pineapple', u'apple', u'dapple', u'scalable')
    >>> token_ids = [lookup(s) for s in strings]
    >>> clusters = [cluster_of(t) for t in token_ids]
    >>> print ["{0:b}".format(c) for c in clusters]
    ["100111110110", "100111100100", "01010111011001", "100111110110"]

    The clusterings are unideal, but often slightly useful.
    "pineapple" and "apple" share a long prefix, indicating a similar meaning,
    while "dapple" is totally different. On the other hand, "scalable" receives
    the same cluster ID as "pineapple", which is not what we'd like.
    '''
    return (<Lexeme*>lex_id).cluster
 cpdef char first_of(size_t lex_id) except 0:
    '''Access the first byte of a utf8 encoding of the word.

    lex_id is cast to a Lexeme* and string[0] is returned as a C char.

    >>> lex_id = lookup(u'Hello')
    >>> chr(first_of(lex_id))
    'H'
    '''
    return (<Lexeme*>lex_id).string[0]
 cpdef size_t length_of(size_t lex_id) except 0:
    '''Access the (unicode) length of the word.

    lex_id is cast to a Lexeme* and its `length` field is returned.
    '''
    cdef Lexeme* word = <Lexeme*>lex_id
    return word.length
 cpdef double prob_of(size_t lex_id) except 1:
    '''Access an estimate of the word's unigram log probability.

    Probabilities are calculated from a large text corpus, and smoothed using
    simple Good-Turing.  Estimates are read from data/en/probabilities, and
    can be replaced using spacy.en.load_probabilities.

    lex_id is cast to a Lexeme* and its `prob` field is returned.

    >>> prob_of(lookup(u'world'))
    -20.10340371976182
    '''
    return (<Lexeme*>lex_id).prob
 # Compile-time constants: bit positions inside Lexeme.dist_flags, tested by
 # is_often_uppered and is_often_titled respectively.
 DEF OFT_UPPER = 1
 DEF OFT_TITLE = 2
 cpdef bint is_often_uppered(size_t lex_id):
    '''Check the OFT_UPPER distributional flag for the word.

    The OFT_UPPER flag records whether a lower-cased version of the word
    is found in all-upper case frequently in a large sample of text, where
    "frequently" is defined as P >= 0.95 (chosen for high mutual information for
    POS tagging).

    Case statistics are estimated from a large text corpus. Estimates are read
    from data/en/case_stats, and can be replaced using spacy.en.load_case_stats.

    Returns True when bit OFT_UPPER of the word's dist_flags is set.

    >>> is_often_uppered(lookup(u'nato'))
    True
    >>> is_often_uppered(lookup(u'the'))
    False
    '''
    return (<Lexeme*>lex_id).dist_flags & (1 << OFT_UPPER)
 cpdef bint is_often_titled(size_t lex_id):
    '''Check the OFT_TITLE distributional flag for the word.

    The OFT_TITLE flag records whether a lower-cased version of the word
    is found title-cased (see string.istitle) frequently in a large sample of text,
    where "frequently" is defined as P >= 0.3 (chosen for high mutual information for
    POS tagging).

    Case statistics are estimated from a large text corpus. Estimates are read
    from data/en/case_stats, and can be replaced using spacy.en.load_case_stats.

    Returns True when bit OFT_TITLE of the word's dist_flags is set.

    >>> is_often_titled(lookup(u'john'))
    True
    >>> is_often_titled(lookup(u'Bill'))
    False
    '''
    return (<Lexeme*>lex_id).dist_flags & (1 << OFT_TITLE)
 cpdef bint check_orth_flag(size_t lex_id, OrthFlags flag) except *:
    '''Test whether bit `flag` of the word's orth_flags is set.

    `flag` is a bit position, not a mask.
    '''
    return (<Lexeme*>lex_id).orth_flags & (1 << flag)
 cpdef bint check_dist_flag(size_t lex_id, DistFlags flag) except *:
    '''Test whether bit `flag` of the word's dist_flags is set.

    `flag` is a bit position, not a mask.
    '''
    return (<Lexeme*>lex_id).dist_flags & (1 << flag)
 cpdef bint can_tag(LexID lex_id, TagFlags flag) except *:
    '''Test whether bit `flag` of the word's possible_tags is set.

    `flag` is a bit position, not a mask.
    '''
    # Shift a TagFlags-typed 1 rather than the C int literal `1`: shifting an
    # int by 31 or more bits is undefined, so the upper bits of the 64-bit
    # possible_tags field could never be tested reliably.
    cdef TagFlags one = 1
    return (<Lexeme*>lex_id).possible_tags & (one << flag)
--- a/spacy/word.pxd
+++ b/spacy/word.pxd
@ -29,18 +29,19 @@ cdef enum:
 cdef class Word:
    # NB: the readonly keyword refers to _Python_ access. The attributes are
    # writeable from Cython.
-    cdef readonly StringHash lex
+    cdef readonly StringHash key
-    cdef readonly char* string
+    cdef readonly char** utf8_strings
    cdef readonly size_t length
    cdef readonly double prob
    cdef readonly ClusterID cluster
    cdef readonly TagFlags possible_tags
    cdef readonly DistFlags dist_flags
    cdef readonly OrthFlags orth_flags
    cdef StringHash* string_views
    cpdef StringHash get_view(self, size_t i) except 0
 cdef class CasedWord(Word):
    cpdef bint can_tag(self, TagFlags flag) except *
    cpdef bint check_dist_flag(self, DistFlags flag) except *
    cpdef bint check_orth_flag(self, OrthFlags flag) except *
--- a/spacy/word.pyx
+++ b/spacy/word.pyx
@ -60,11 +60,11 @@ cdef class Word:
            while "dapple" is totally different. On the other hand, "scalable" receives
            the same cluster ID as "pineapple", which is not what we'd like.
    """
-    def __cinit__(self, bytes string, list string_views):
+    def __cinit__(self, bytes string, list string_views, prob=0.0, cluster=0,
                  orth_flags=0, dist_flags=0, possible_tags=0):
        self.string = <char*>string
        self.length = len(string)
-        self.lex = hash(string)
+        self.views = <char**>calloc(len(string_views), sizeof(StringHash))
        self.string_views = <StringHash*>calloc(len(string_views), sizeof(StringHash))
        cdef unicode view
        for i in range(len(string_views)):
            view = string_views[i]
@ -98,6 +98,12 @@ cdef class Word:
        corpus. "Often" is chosen by heuristic.
        """
        return self.possible_tags & (1 << flag)
 cdef class CasedWord(Word):
    def __cinit__(self, bytes string):
        string_views = [get_normaized(string), get_word_shape(string), string[-3:]]
        Word.__cinit__(self, string, string_views)
    cpdef bint is_often_uppered(self) except *:
        '''Check the OFT_UPPER distributional flag for the word.