* Add Word classes

2025-07-14 18:22:27 +03:00 · 2014-08-24 18:14:08 +02:00 · 2014-08-24 18:14:08 +02:00 · ce59526011
commit ce59526011
parent 3b793cf4f7
2 changed files with 297 additions and 0 deletions
--- a/spacy/word.pxd
+++ b/spacy/word.pxd
@ -0,0 +1,58 @@
+from libc.stdint cimport uint32_t
+from libc.stdint cimport uint64_t
+
+ctypedef int ClusterID  # type of Word.cluster
+ctypedef uint32_t StringHash  # type of Word.lex and of each entry in Word.string_views
+ctypedef size_t LexID  # NOTE(review): not referenced by the declarations visible in this file
+ctypedef char OrthFlags  # char-sized bit field (Word.orth_flags); bit indices come from the OrthFlag enum
+ctypedef char DistFlags  # char-sized bit field (Word.dist_flags)
+ctypedef uint64_t TagFlags  # 64-bit bit field (Word.possible_tags), tested by Word.can_tag
+
+
+cdef enum OrthFlag:  # bit indices into Word.orth_flags; implicit numbering starts at 0, so order matters
+    IS_ALPHA  # tested by Word.is_alpha()
+    IS_DIGIT  # tested by Word.is_digit()
+    IS_PUNCT  # tested by Word.is_punct()
+    IS_SPACE  # tested by Word.is_space()
+    IS_LOWER  # tested by Word.is_lower()
+    IS_UPPER  # tested by Word.is_upper()
+    IS_TITLE  # tested by Word.is_title()
+    IS_ASCII  # tested by Word.is_ascii()
+
+
+cdef enum:  # NOTE(review): presumably indices for Word.get_view()/string_views; unused in the visible code, confirm
+    NORM
+    SHAPE
+    LAST3
+
+
+cdef class Word:
+    # NB: the readonly keyword refers to _Python_ access. The attributes are
+    # writeable from Cython.
+    cdef readonly StringHash lex  # hash of the byte-string given to __cinit__
+    cdef readonly char* string  # the word's utf8-encoded bytes, as a C string
+    cdef readonly size_t length  # len() of the byte-string given to __cinit__
+    cdef readonly double prob  # unigram log-probability estimate; not set by __cinit__
+    cdef readonly ClusterID cluster  # Brown cluster id; not set by __cinit__
+    cdef readonly TagFlags possible_tags  # bit field tested by can_tag(); not set by __cinit__
+    cdef readonly DistFlags dist_flags  # bit field tested by check_dist_flag(); not set by __cinit__
+    cdef readonly OrthFlags orth_flags  # bit field tested by check_orth_flag(); not set by __cinit__
+    cdef StringHash* string_views  # calloc'd array of view hashes; C-only (no readonly/public)
+
+    cpdef StringHash get_view(self, size_t i) except 0  # returns string_views[i]; 0 is the error-return value
+
+    cpdef bint can_tag(self, TagFlags flag) except *  # flag is used as a bit index into possible_tags
+    cpdef bint check_dist_flag(self, DistFlags flag) except *  # flag is used as a bit index into dist_flags
+    cpdef bint check_orth_flag(self, OrthFlags flag) except *  # flag is used as a bit index into orth_flags
+
+    cpdef bint is_often_titled(self) except *  # tests the OFT_TITLE bit of dist_flags
+    cpdef bint is_often_uppered(self) except *  # tests the OFT_UPPER bit of dist_flags
+
+    cpdef bint is_alpha(self) except *  # tests the IS_ALPHA bit of orth_flags
+    cpdef bint is_digit(self) except *  # tests the IS_DIGIT bit of orth_flags
+    cpdef bint is_punct(self) except *  # tests the IS_PUNCT bit of orth_flags
+    cpdef bint is_space(self) except *  # tests the IS_SPACE bit of orth_flags
+    cpdef bint is_lower(self) except *  # tests the IS_LOWER bit of orth_flags
+    cpdef bint is_upper(self) except *  # tests the IS_UPPER bit of orth_flags
+    cpdef bint is_title(self) except *  # tests the IS_TITLE bit of orth_flags
+    cpdef bint is_ascii(self) except *  # tests the IS_ASCII bit of orth_flags
--- a/spacy/word.pyx
+++ b/spacy/word.pyx
@ -0,0 +1,239 @@
+# cython: profile=True
+# cython: embedsignature=True
+
+
+from libc.stdlib cimport calloc, free
+
+
+# Python-visible enum for POS tags: module-level names bound to the
+# consecutive integers 0..13, in the order listed.
+(
+    PUNCT, CONJ, NUM, X, DET, ADP, ADJ,
+    ADV, VERB, NOUN, PDT, POS, PRON, PRT,
+) = range(14)
+
+
+DEF OFT_UPPER = 1  # bit index in Word.dist_flags tested by Word.is_often_uppered()
+DEF OFT_TITLE = 2  # bit index in Word.dist_flags tested by Word.is_often_titled()
+
+
+cdef class Word:
+    """A lexical type.
+
+    Attributes:
+        string (bytes):
+            A utf8-encoded byte-string for the word. The Word owns a private,
+            NUL-terminated copy of the bytes passed to the constructor.
+
+        lex (StringHash):
+            A hash of the word (``hash()`` of the byte-string, stored in a
+            StringHash).
+
+        length (size_t):
+            The length of ``string`` in bytes, i.e. ``len()`` of the
+            utf8-encoded byte-string. For non-ASCII words this differs from
+            the number of unicode characters.
+
+        prob (double):
+            An estimate of the word's unigram log probability.
+
+            Probabilities are calculated from a large text corpus, and smoothed using
+            simple Good-Turing.  Estimates are read from data/en/probabilities, and
+            can be replaced using spacy.en.load_probabilities.
+
+        cluster (int):
+            An integer representation of the word's Brown cluster.
+
+            A Brown cluster is an address into a binary tree, which gives some (noisy)
+            information about the word's distributional context.
+
+            >>> strings = (u'pineapple', u'apple', u'dapple', u'scalable')
+            >>> print ["{0:b}".format(lookup(s).cluster) for s in strings]
+            ["100111110110", "100111100100", "01010111011001", "100111110110"]
+
+            The clusterings are unideal, but often slightly useful.
+            "pineapple" and "apple" share a long prefix, indicating a similar meaning,
+            while "dapple" is totally different. On the other hand, "scalable" receives
+            the same cluster ID as "pineapple", which is not what we'd like.
+
+        possible_tags (TagFlags), dist_flags (DistFlags), orth_flags (OrthFlags):
+            Bit fields queried through can_tag(), check_dist_flag() and
+            check_orth_flag() respectively.
+
+    The constructor only fills in ``string``, ``length``, ``lex`` and the
+    internal array of view hashes; ``prob``, ``cluster`` and the three bit
+    fields are not assigned by it.
+    """
+    def __cinit__(self, bytes string, list string_views):
+        """Build a Word from its utf8 bytes and a list of unicode "views".
+
+        Args:
+            string (bytes): utf8-encoded form of the word.
+            string_views (list): unicode strings; the hash of each one is
+                stored, in order, for retrieval with get_view().
+
+        Raises:
+            MemoryError: if a C allocation fails.
+            TypeError: if an argument is None or an entry of string_views is
+                not a unicode object.
+        """
+        cdef size_t n_bytes = len(string)
+        cdef size_t n_views = len(string_views)
+        cdef char* src = string
+        cdef size_t i
+        cdef unicode view
+
+        # Copy the bytes instead of borrowing the buffer of `string`: the Word
+        # holds no reference to that Python object, so a borrowed pointer
+        # would dangle as soon as the caller's bytes object is collected.
+        # calloc zero-fills, which provides the trailing NUL.
+        self.string = <char*>calloc(n_bytes + 1, sizeof(char))
+        if self.string == NULL:
+            raise MemoryError()
+        for i in range(n_bytes):
+            self.string[i] = src[i]
+        self.length = n_bytes
+        self.lex = hash(string)
+
+        # calloc(0, ...) may legitimately return NULL, so only allocate (and
+        # treat NULL as a failure) when there is at least one view.
+        if n_views != 0:
+            self.string_views = <StringHash*>calloc(n_views, sizeof(StringHash))
+            if self.string_views == NULL:
+                raise MemoryError()
+        for i in range(n_views):
+            view = string_views[i]
+            self.string_views[i] = hash(view)
+
+    def __dealloc__(self):
+        # Both buffers are allocated in __cinit__. free(NULL) is a no-op, so
+        # this is also safe when __cinit__ raised before allocating them.
+        free(self.string)
+        free(self.string_views)
+
+    cpdef StringHash get_view(self, size_t i) except 0:
+        """Return the stored hash of the i-th string view.
+
+        ``i`` indexes the ``string_views`` list given to the constructor.
+        The number of views is not recorded on the Word, so no bounds check
+        is performed here: the caller must pass a valid index.
+
+        NOTE(review): the method is declared ``except 0``, so a stored hash
+        that happens to equal 0 is indistinguishable from an error return.
+        """
+        return self.string_views[i]
+
+    cpdef bint check_orth_flag(self, OrthFlags flag) except *:
+        """Access the value of one of the pre-computed boolean orthographic features.
+
+        ``flag`` is a bit index into ``orth_flags`` (not a mask). word.pxd
+        declares the OrthFlag enum (IS_ALPHA ... IS_ASCII) with the indices
+        used by the is_*() convenience methods.
+
+        Meanings depend on the language-specific orthographic features being loaded.
+        The suggested features for latin-alphabet languages are: TODO
+        """
+        return (self.orth_flags & (1 << flag)) != 0
+
+    cpdef bint check_dist_flag(self, DistFlags flag) except *:
+        """Access the value of one of the pre-computed boolean distribution features.
+
+        ``flag`` is a bit index into ``dist_flags`` (not a mask).
+
+        Meanings depend on the language-specific distributional features being loaded.
+        The suggested features for latin-alphabet languages are: TODO
+        """
+        return (self.dist_flags & (1 << flag)) != 0
+
+    cpdef bint can_tag(self, TagFlags flag) except *:
+        """Check whether the word often receives a particular tag in a large text
+        corpus. "Often" is chosen by heuristic.
+
+        ``flag`` is a bit index into ``possible_tags``, e.g. one of the
+        module-level POS constants (PUNCT, CONJ, ...).
+        """
+        # Shift a TagFlags-wide 1: a plain `1` is a C int, and shifting it by
+        # 31 or more bits is undefined even though possible_tags is 64-bit.
+        # The explicit comparison keeps high bits from being lost when the
+        # result is narrowed to bint.
+        return (self.possible_tags & ((<TagFlags>1) << flag)) != 0
+
+    cpdef bint is_often_uppered(self) except *:
+        """Check the OFT_UPPER distributional flag for the word.
+
+        The OFT_UPPER flag records whether a lower-cased version of the word
+        is found in all-upper case frequently in a large sample of text, where
+        "frequently" is defined as P >= 0.95 (chosen for high mutual information for
+        POS tagging).
+
+        Case statistics are estimated from a large text corpus. Estimates are read
+        from data/en/case_stats, and can be replaced using spacy.en.load_case_stats.
+
+        >>> lookup(u'nato').is_often_uppered()
+        True
+        >>> lookup(u'the').is_often_uppered()
+        False
+        """
+        return (self.dist_flags & (1 << OFT_UPPER)) != 0
+
+    cpdef bint is_often_titled(self) except *:
+        """Check the OFT_TITLE distributional flag for the word.
+
+        The OFT_TITLE flag records whether a lower-cased version of the word
+        is found title-cased (see string.istitle) frequently in a large sample of text,
+        where "frequently" is defined as P >= 0.3 (chosen for high mutual information for
+        POS tagging).
+
+        Case statistics are estimated from a large text corpus. Estimates are read
+        from data/en/case_stats, and can be replaced using spacy.en.load_case_stats.
+
+        >>> lookup(u'john').is_often_titled()
+        True
+        >>> lookup(u'Bill').is_often_titled()
+        False
+        """
+        return (self.dist_flags & (1 << OFT_TITLE)) != 0
+
+    cpdef bint is_alpha(self) except *:
+        """Check whether all characters in the word's string are alphabetic.
+
+        Should match the :py:func:`unicode.isalpha()` function.
+
+        >>> lookup(u'Hello').is_alpha()
+        True
+        >>> lookup(u'العرب').is_alpha()
+        True
+        >>> lookup(u'10').is_alpha()
+        False
+        """
+        return (self.orth_flags & (1 << IS_ALPHA)) != 0
+
+    cpdef bint is_digit(self) except *:
+        """Check whether all characters in the word's string are numeric.
+
+        Should match the :py:func:`unicode.isdigit()` function.
+
+        >>> lookup(u'10').is_digit()
+        True
+        >>> lookup(u'๐').is_digit()
+        True
+        >>> lookup(u'one').is_digit()
+        False
+        """
+        return (self.orth_flags & (1 << IS_DIGIT)) != 0
+
+    cpdef bint is_punct(self) except *:
+        """Check whether all characters belong to a punctuation unicode data category.
+
+        >>> lookup(u'.').is_punct()
+        True
+        >>> lookup(u'⁒').is_punct()
+        True
+        >>> lookup(u' ').is_punct()
+        False
+        """
+        return (self.orth_flags & (1 << IS_PUNCT)) != 0
+
+    cpdef bint is_space(self) except *:
+        """Give the result of unicode.isspace() for the word.
+
+        >>> lookup(u'\\t').is_space()
+        True
+        >>> lookup(u'<unicode space>').is_space()
+        True
+        >>> lookup(u'Hi\\n').is_space()
+        False
+        """
+        return (self.orth_flags & (1 << IS_SPACE)) != 0
+
+    cpdef bint is_lower(self) except *:
+        """Give the result of unicode.islower() for the word.
+
+        >>> lookup(u'hi').is_lower()
+        True
+        >>> lookup(<unicode>).is_lower()
+        True
+        >>> lookup(u'10').is_lower()
+        False
+        """
+        return (self.orth_flags & (1 << IS_LOWER)) != 0
+
+    cpdef bint is_upper(self) except *:
+        """Give the result of unicode.isupper() for the word.
+
+        >>> lookup(u'HI').is_upper()
+        True
+        >>> lookup(u'H10').is_upper()
+        True
+        >>> lookup(u'10').is_upper()
+        False
+        """
+        return (self.orth_flags & (1 << IS_UPPER)) != 0
+
+    cpdef bint is_title(self) except *:
+        """Give the result of unicode.istitle() for the word.
+
+        >>> lookup(u'Hi').is_title()
+        True
+        >>> lookup(u'Hi1').is_title()
+        True
+        >>> lookup(u'1').is_title()
+        False
+        """
+        return (self.orth_flags & (1 << IS_TITLE)) != 0
+
+    cpdef bint is_ascii(self) except *:
+        """Give the result of checking whether all characters in the string are ascii.
+
+        >>> lookup(u'Hi').is_ascii()
+        True
+        >>> lookup(u' ').is_ascii()
+        True
+        >>> lookup(u'<unicode>').is_ascii()
+        False
+        """
+        return (self.orth_flags & (1 << IS_ASCII)) != 0