* Remove Lexeme struct, preparing to rename Word to Lexeme.

2026-02-01 21:16:05 +03:00 · 2014-08-24 19:24:42 +02:00 · 2014-08-24 19:24:42 +02:00 · 88095666dc
commit 88095666dc
parent ce59526011
6 changed files with 18 additions and 204 deletions
--- a/spacy/en.pxd
+++ b/spacy/en.pxd
@ -3,14 +3,13 @@ from libcpp.vector cimport vector
 from spacy.spacy cimport StringHash

 from spacy.spacy cimport Language
-from spacy.word cimport Word
-from spacy.tokens cimport Tokens
+from spacy.word cimport LatinWord
 cimport cython


 cdef class English(spacy.Language):
    cdef int find_split(self, unicode word)
-    cdef int set_orth(self, unicode word, Word lex) except -1
+    cdef LatinWord new_lexeme(self, unicode string)


 cdef English EN
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@ -44,6 +44,9 @@ cimport spacy


 cdef class English(spacy.Language):
+    cdef LatinWord new_lexeme(self, unicode string):
+        return LatinWord(string)
+
    cdef int find_split(self, unicode word):
        cdef size_t length = len(word)
        cdef int i = 0
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@ -1,40 +0,0 @@
-from libc.stdint cimport uint32_t
-from libc.stdint cimport uint64_t
-cimport cython
-
-ctypedef int ClusterID
-ctypedef uint32_t StringHash
-ctypedef size_t LexID
-ctypedef char OrthFlags
-ctypedef char DistFlags
-ctypedef uint64_t TagFlags
-
-
-cdef struct Lexeme:
-    StringHash lex
-    char* string
-    size_t length
-    double prob
-    ClusterID cluster
-    TagFlags possible_tags
-    DistFlags dist_flags
-    OrthFlags orth_flags
-    StringHash* string_views
-
-
-cpdef StringHash lex_of(LexID lex_id) except 0
-cpdef char first_of(LexID lex_id) except 0
-cpdef size_t length_of(LexID lex_id) except 0
-cpdef double prob_of(LexID lex_id) except 1
-cpdef ClusterID cluster_of(LexID lex_id) except 0
-
-
-cpdef bint is_often_titled(size_t lex_id)
-cpdef bint is_often_uppered(size_t lex_id)
-
-
-cpdef bint can_tag(LexID lex, TagFlags flag) except *
-cpdef bint check_dist_flag(LexID lex, DistFlags flag) except *
-cpdef bint check_orth_flag(LexID lex, OrthFlags flag) except *
-
-cpdef StringHash view_of(LexID lex_id, size_t view) except 0
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@ -1,155 +0,0 @@
-# cython: profile=True
-# cython: embedsignature=True
-'''Accessors for Lexeme properties, given a lex_id, which is cast to a Lexeme*.
-Mostly useful from Python-space. From Cython-space, you can just cast to
-Lexeme* yourself.
-'''
-from __future__ import unicode_literals
-
-from libc.stdlib cimport malloc, calloc, free
-from libc.stdint cimport uint64_t
-
-from spacy.spacy cimport StringHash
-
-# Python-visible enum for POS tags
-PUNCT = 0
-CONJ = 1
-NUM = 2
-X = 3
-DET = 4
-ADP = 5
-ADJ = 6
-ADV = 7
-VERB = 8
-NOUN = 9
-PDT = 10
-POS = 11
-PRON = 12
-PRT = 13
-
-cpdef int set_flags(LexID lex_id, object active_flags) except *:
-    """Set orthographic bit flags for a Lexeme.
-
-    Args:
-        lex_id (LexID): A reference ID for a Lexeme.
-        active_flags: A sequence of bit positions to set in orth_flags.
-    """
-    cdef size_t flag
-    cdef Lexeme* w = <Lexeme*>lex_id
-    for flag in active_flags:
-        w.orth_flags |= 1 << flag
-
-
-cpdef StringHash view_of(LexID lex_id, size_t view) except 0:
-    return (<Lexeme*>lex_id).string_views[view]
-
-
-cpdef StringHash lex_of(LexID lex_id) except 0:
-    '''Access a hash of the word's string.
-
-    >>> lex_of(lookup(u'Hi')) == hash(u'Hi')
-    True
-    '''
-    return (<Lexeme*>lex_id).lex
-
-
-cpdef ClusterID cluster_of(LexID lex_id) except 0:
-    '''Access an integer representation of the word's Brown cluster.
-
-    A Brown cluster is an address into a binary tree, which gives some (noisy)
-    information about the word's distributional context.
-    
-    >>> strings = (u'pineapple', u'apple', u'dapple', u'scalable')
-    >>> token_ids = [lookup(s) for s in strings]
-    >>> clusters = [cluster_of(t) for t in token_ids]
-    >>> print ["{0:b}".format(cluster_of(t)) for t in token_ids]
-    ["100111110110", "100111100100", "01010111011001", "100111110110"]
-
-    The clusterings are unideal, but often slightly useful.
-    "pineapple" and "apple" share a long prefix, indicating a similar meaning,
-    while "dapple" is totally different. On the other hand, "scalable" receives
-    the same cluster ID as "pineapple", which is not what we'd like.
-    '''
-    return (<Lexeme*>lex_id).cluster
-
-
-cpdef char first_of(size_t lex_id) except 0:
-    '''Access the first byte of a utf8 encoding of the word.
-
-    >>> lex_id = lookup(u'Hello')
-    >>> chr(first_of(lex_id))
-    'H'
-    '''
-    return (<Lexeme*>lex_id).string[0]
-
-
-cpdef size_t length_of(size_t lex_id) except 0:
-    '''Access the (unicode) length of the word.
-    '''
-    cdef Lexeme* word = <Lexeme*>lex_id
-    return word.length
-
-
-cpdef double prob_of(size_t lex_id) except 1:
-    '''Access an estimate of the word's unigram log probability.
-
-    Probabilities are calculated from a large text corpus, and smoothed using
-    simple Good-Turing.  Estimates are read from data/en/probabilities, and
-    can be replaced using spacy.en.load_probabilities.
-    
-    >>> prob_of(lookup(u'world'))
-    -20.10340371976182
-    '''
-    return (<Lexeme*>lex_id).prob
-
-DEF OFT_UPPER = 1
-DEF OFT_TITLE = 2
-
-cpdef bint is_often_uppered(size_t lex_id):
-    '''Check the OFT_UPPER distributional flag for the word.
-    
-    The OFT_UPPER flag records whether a lower-cased version of the word
-    is found in all-upper case frequently in a large sample of text, where
-    "frequently" is defined as P >= 0.95 (chosen for high mutual information for
-    POS tagging).
-    
-    Case statistics are estimated from a large text corpus. Estimates are read
-    from data/en/case_stats, and can be replaced using spacy.en.load_case_stats.
-    
-    >>> is_often_uppered(lookup(u'nato'))
-    True
-    >>> is_often_uppered(lookup(u'the')) 
-    False
-    '''
-    return (<Lexeme*>lex_id).dist_flags & (1 << OFT_UPPER)
-
-
-cpdef bint is_often_titled(size_t lex_id):
-    '''Check the OFT_TITLE distributional flag for the word.
-    
-    The OFT_TITLE flag records whether a lower-cased version of the word
-    is found title-cased (see string.istitle) frequently in a large sample of text,
-    where "frequently" is defined as P >= 0.3 (chosen for high mutual information for
-    POS tagging).
-    
-    Case statistics are estimated from a large text corpus. Estimates are read
-    from data/en/case_stats, and can be replaced using spacy.en.load_case_stats.
-    
-    >>> is_often_titled(lookup(u'john'))
-    True
-    >>> is_often_titled(lookup(u'Bill'))
-    False
-    '''
-    return (<Lexeme*>lex_id).dist_flags & (1 << OFT_TITLE)
-
-
-cpdef bint check_orth_flag(size_t lex_id, OrthFlags flag) except *:
-    return (<Lexeme*>lex_id).orth_flags & (1 << flag)
-
-
-cpdef bint check_dist_flag(size_t lex_id, DistFlags flag) except *:
-    return (<Lexeme*>lex_id).dist_flags & (1 << flag)
-
-
-cpdef bint can_tag(LexID lex_id, TagFlags flag) except *:
-    return (<Lexeme*>lex_id).possible_tags & (1 << flag)
--- a/spacy/word.pxd
+++ b/spacy/word.pxd
@ -29,18 +29,19 @@ cdef enum:
 cdef class Word:
    # NB: the readonly keyword refers to _Python_ access. The attributes are
    # writeable from Cython.
-    cdef readonly StringHash lex
-    cdef readonly char* string
+    cdef readonly StringHash key
+    cdef readonly char** utf8_strings
    cdef readonly size_t length
    cdef readonly double prob
    cdef readonly ClusterID cluster
    cdef readonly TagFlags possible_tags
    cdef readonly DistFlags dist_flags
    cdef readonly OrthFlags orth_flags
-    cdef StringHash* string_views

    cpdef StringHash get_view(self, size_t i) except 0

+
+cdef class CasedWord(Word):
    cpdef bint can_tag(self, TagFlags flag) except *
    cpdef bint check_dist_flag(self, DistFlags flag) except *
    cpdef bint check_orth_flag(self, OrthFlags flag) except *
--- a/spacy/word.pyx
+++ b/spacy/word.pyx
@ -60,11 +60,11 @@ cdef class Word:
            while "dapple" is totally different. On the other hand, "scalable" receives
            the same cluster ID as "pineapple", which is not what we'd like.
    """
-    def __cinit__(self, bytes string, list string_views):
+    def __cinit__(self, bytes string, list string_views, prob=0.0, cluster=0,
+                  orth_flags=0, dist_flags=0, possible_tags=0):
        self.string = <char*>string
        self.length = len(string)
-        self.lex = hash(string)
-        self.string_views = <StringHash*>calloc(len(string_views), sizeof(StringHash))
+        self.views = <char**>calloc(len(string_views), sizeof(StringHash))
        cdef unicode view
        for i in range(len(string_views)):
            view = string_views[i]
@ -98,6 +98,12 @@ cdef class Word:
        corpus. "Often" is chosen by heuristic.
        """
        return self.possible_tags & (1 << flag)
+
+
+cdef class CasedWord(Word):
+    def __cinit__(self, bytes string):
+        string_views = [get_normaized(string), get_word_shape(string), string[-3:]]
+        Word.__cinit__(self, string, string_views)
    
    cpdef bint is_often_uppered(self) except *:
        '''Check the OFT_UPPER distributional flag for the word.