* Add Word classes

2025-11-04 09:57:26 +03:00 · 2014-08-24 18:14:08 +02:00 · 2014-08-24 18:14:08 +02:00 · ce59526011
commit ce59526011
parent 3b793cf4f7
2 changed files with 297 additions and 0 deletions
--- a/spacy/word.pxd
+++ b/spacy/word.pxd
@ -0,0 +1,58 @@
 from libc.stdint cimport uint32_t
 from libc.stdint cimport uint64_t
 # Integer id of a word's Brown cluster (type of Word.cluster).
 ctypedef int ClusterID
 # 32-bit hash value of a string (type of Word.lex and of each string view).
 ctypedef uint32_t StringHash
 # NOTE(review): LexID is not referenced in the visible declarations --
 # presumably an identifier/address for a lexeme; confirm against callers.
 ctypedef size_t LexID
 # Bit field of orthographic flags; bit positions are the OrthFlag enum values.
 ctypedef char OrthFlags
 # Bit field of distributional flags (see Word.check_dist_flag).
 ctypedef char DistFlags
 # Bit field of possible POS tags, one bit per tag (see Word.can_tag).
 ctypedef uint64_t TagFlags
 # Bit positions inside an OrthFlags field. No explicit values are given, so
 # the members number 0..7 in declaration order; Word tests them with
 # `orth_flags & (1 << FLAG)`. Reordering members changes the bit layout.
 cdef enum OrthFlag:
    IS_ALPHA  # all characters are alphabetic
    IS_DIGIT  # all characters are numeric
    IS_PUNCT  # all characters are punctuation
    IS_SPACE  # string is whitespace
    IS_LOWER  # string is lower-cased
    IS_UPPER  # string is upper-cased
    IS_TITLE  # string is title-cased
    IS_ASCII  # all characters are ascii
 # Anonymous enum; members take the values 0, 1, 2 in declaration order.
 # NOTE(review): presumably these are the indices of the pre-computed string
 # views passed to Word.get_view() -- their use is not visible here; confirm.
 cdef enum:
    NORM
    SHAPE
    LAST3
 # C-level declaration of the Word extension type (implemented in word.pyx).
 cdef class Word:
    # NB: the readonly keyword refers to _Python_ access. The attributes are
    # writeable from Cython.
    # Hash of the word's byte string.
    cdef readonly StringHash lex
    # The word as a utf8-encoded C string.
    cdef readonly char* string
    # Length of `string`, taken with len() on the bytes given to the constructor.
    cdef readonly size_t length
    # Estimate of the word's unigram log probability.
    cdef readonly double prob
    # Integer representation of the word's Brown cluster.
    cdef readonly ClusterID cluster
    # Bit set of tags the word may receive; tested by can_tag().
    cdef readonly TagFlags possible_tags
    # Bit set of distributional flags; tested by check_dist_flag().
    cdef readonly DistFlags dist_flags
    # Bit set of orthographic flags, indexed by OrthFlag; tested by
    # check_orth_flag() and the is_*() helpers.
    cdef readonly OrthFlags orth_flags
    # C array holding one hash per string view given to the constructor.
    # Not exposed to Python; no element count is stored alongside it.
    cdef StringHash* string_views
    # `except 0`: a return value of 0 is the error signal for this method.
    cpdef StringHash get_view(self, size_t i) except 0
    # `except *`: any bint may be returned, so callers check for a pending
    # exception after every call.
    cpdef bint can_tag(self, TagFlags flag) except *
    cpdef bint check_dist_flag(self, DistFlags flag) except *
    cpdef bint check_orth_flag(self, OrthFlags flag) except *
    cpdef bint is_often_titled(self) except *
    cpdef bint is_often_uppered(self) except *
    cpdef bint is_alpha(self) except *
    cpdef bint is_digit(self) except *
    cpdef bint is_punct(self) except *
    cpdef bint is_space(self) except *
    cpdef bint is_lower(self) except *
    cpdef bint is_upper(self) except *
    cpdef bint is_title(self) except *
    cpdef bint is_ascii(self) except *
--- a/spacy/word.pyx
+++ b/spacy/word.pyx
@ -0,0 +1,239 @@
 # cython: profile=True
 # cython: embedsignature=True
 from libc.stdlib cimport calloc, free
 # Python-visible enum for POS tags: module-level names numbered 0..13 in the
 # order listed below.
 (PUNCT, CONJ, NUM, X, DET, ADP, ADJ,
  ADV, VERB, NOUN, PDT, POS, PRON, PRT) = range(14)
 # Compile-time constants: bit indices into Word.dist_flags, tested as
 # `dist_flags & (1 << OFT_...)`.
 # OFT_UPPER: the word is frequently found in all-upper case.
 DEF OFT_UPPER = 1
 # OFT_TITLE: the word is frequently found title-cased.
 DEF OFT_TITLE = 2
 cdef class Word:
    """A lexical type.

    Attributes:
        string (bytes):
            A utf8-encoded byte-string for the word. The Word keeps its own
            NUL-terminated copy of the bytes passed to the constructor.
        lex (StringHash):
            A hash of the word.
        length (size_t):
            The length of the utf8-encoded byte-string, i.e. the number of
            bytes (not the number of unicode characters).
        prob (double):
            An estimate of the word's unigram log probability.
            Probabilities are calculated from a large text corpus, and smoothed using
            simple Good-Turing.  Estimates are read from data/en/probabilities, and
            can be replaced using spacy.en.load_probabilities.
        cluster (int):
            An integer representation of the word's Brown cluster.
            A Brown cluster is an address into a binary tree, which gives some (noisy)
            information about the word's distributional context.
            >>> strings = (u'pineapple', u'apple', u'dapple', u'scalable')
            >>> print ["{0:b}".format(lookup(s).cluster) for s in strings]
            ["100111110110", "100111100100", "01010111011001", "100111110110"]
            The clusterings are unideal, but often slightly useful.
            "pineapple" and "apple" share a long prefix, indicating a similar meaning,
            while "dapple" is totally different. On the other hand, "scalable" receives
            the same cluster ID as "pineapple", which is not what we'd like.
    """
    def __cinit__(self, bytes string, list string_views):
        """Initialise the word from its utf8 bytes and its string views.

        Args:
            string (bytes): The utf8-encoded word. Its bytes are copied.
            string_views (list): Unicode strings; the hash of each one is
                stored, in order, for retrieval with get_view().

        Raises:
            MemoryError: If a buffer cannot be allocated.
        """
        cdef size_t n_bytes = len(string)
        cdef size_t n_views = len(string_views)
        cdef char* borrowed = <char*>string
        cdef size_t i
        cdef unicode view
        # The char* obtained from a bytes object is only valid while that
        # object is alive, and this class holds no reference to it. Copy the
        # data into a buffer we own; calloc zero-fills, so the extra byte is
        # the NUL terminator.
        self.string = <char*>calloc(n_bytes + 1, sizeof(char))
        if self.string == NULL:
            raise MemoryError()
        for i in range(n_bytes):
            self.string[i] = borrowed[i]
        self.length = n_bytes
        # The Python hash is stored in a StringHash (uint32_t) field.
        self.lex = hash(string)
        if n_views != 0:
            self.string_views = <StringHash*>calloc(n_views, sizeof(StringHash))
            if self.string_views == NULL:
                raise MemoryError()
        for i in range(n_views):
            view = string_views[i]
            self.string_views[i] = hash(view)
    def __dealloc__(self):
        """Release the buffers allocated in __cinit__."""
        # free(NULL) is a no-op, so a partially constructed Word is handled.
        free(self.string)
        free(self.string_views)
    cpdef StringHash get_view(self, size_t i) except 0:
        """Return the stored hash of the i-th string view.

        No bounds check is performed: the number of views is not stored, so
        the caller must pass an index below the length of the string_views
        list the word was constructed with.
        NOTE(review): the method is declared `except 0`, so a view whose
        stored hash is 0 cannot be told apart from the error signal.
        """
        return self.string_views[i]
    cpdef bint check_orth_flag(self, OrthFlags flag) except *:
        """Access the value of one of the pre-computed boolean orthographic features.
        `flag` is a bit index, not a mask: bit (1 << flag) of orth_flags is tested.
        Meanings depend on the language-specific orthographic features being loaded.
        The suggested features for latin-alphabet languages are: TODO
        """
        return (self.orth_flags & (1 << flag)) != 0
    cpdef bint check_dist_flag(self, DistFlags flag) except *:
        """Access the value of one of the pre-computed boolean distribution features.
        `flag` is a bit index, not a mask: bit (1 << flag) of dist_flags is tested.
        Meanings depend on the language-specific distributional features being loaded.
        The suggested features for latin-alphabet languages are: TODO
        """
        return (self.dist_flags & (1 << flag)) != 0
    cpdef bint can_tag(self, TagFlags flag) except *:
        """Check whether the word often receives a particular tag in a large text
        corpus. "Often" is chosen by heuristic.
        `flag` is a bit index into possible_tags (e.g. one of the module-level
        tag constants such as NOUN), not a mask.
        """
        # Build the mask in the 64-bit TagFlags type: a plain `1` is a C int,
        # which is too narrow for the upper bits of possible_tags.
        return (self.possible_tags & (<TagFlags>1 << flag)) != 0
    cpdef bint is_often_uppered(self) except *:
        """Check the OFT_UPPER distributional flag for the word.
        The OFT_UPPER flag records whether a lower-cased version of the word
        is found in all-upper case frequently in a large sample of text, where
        "frequently" is defined as P >= 0.95 (chosen for high mutual information for
        POS tagging).
        Case statistics are estimated from a large text corpus. Estimates are read
        from data/en/case_stats, and can be replaced using spacy.en.load_case_stats.
        >>> is_often_uppered(lookup(u'nato'))
        True
        >>> is_often_uppered(lookup(u'the'))
        False
        """
        return (self.dist_flags & (1 << OFT_UPPER)) != 0
    cpdef bint is_often_titled(self) except *:
        """Check the OFT_TITLE distributional flag for the word.
        The OFT_TITLE flag records whether a lower-cased version of the word
        is found title-cased (see string.istitle) frequently in a large sample of text,
        where "frequently" is defined as P >= 0.3 (chosen for high mutual information for
        POS tagging).
        Case statistics are estimated from a large text corpus. Estimates are read
        from data/en/case_stats, and can be replaced using spacy.en.load_case_stats.
        >>> is_often_titled(lookup(u'john'))
        True
        >>> is_often_titled(lookup(u'Bill'))
        False
        """
        return (self.dist_flags & (1 << OFT_TITLE)) != 0
    cpdef bint is_alpha(self) except *:
        """Check whether all characters in the word's string are alphabetic.
        Should match the :py:func:`unicode.isalpha()` function.
        >>> is_alpha(lookup(u'Hello'))
        True
        >>> is_alpha(lookup(u'العرب'))
        True
        >>> is_alpha(lookup(u'10'))
        False
        """
        return (self.orth_flags & (1 << IS_ALPHA)) != 0
    cpdef bint is_digit(self) except *:
        """Check whether all characters in the word's string are numeric.
        Should match the :py:func:`unicode.isdigit()` function.
        >>> is_digit(lookup(u'10'))
        True
        >>> is_digit(lookup(u'๐'))
        True
        >>> is_digit(lookup(u'one'))
        False
        """
        return (self.orth_flags & (1 << IS_DIGIT)) != 0
    cpdef bint is_punct(self) except *:
        """Check whether all characters belong to a punctuation unicode data category
        for a Lexeme ID.
        >>> is_punct(lookup(u'.'))
        True
        >>> is_punct(lookup(u'⁒'))
        True
        >>> is_punct(lookup(u' '))
        False
        """
        return (self.orth_flags & (1 << IS_PUNCT)) != 0
    cpdef bint is_space(self) except *:
        """Give the result of unicode.isspace() for a Lexeme ID.
        >>> is_space(lookup(u'\\t'))
        True
        >>> is_space(lookup(u'<unicode space>'))
        True
        >>> is_space(lookup(u'Hi\\n'))
        False
        """
        return (self.orth_flags & (1 << IS_SPACE)) != 0
    cpdef bint is_lower(self) except *:
        """Give the result of unicode.islower() for a Lexeme ID.
        >>> is_lower(lookup(u'hi'))
        True
        >>> is_lower(lookup(<unicode>))
        True
        >>> is_lower(lookup(u'10'))
        False
        """
        return (self.orth_flags & (1 << IS_LOWER)) != 0
    cpdef bint is_upper(self) except *:
        """Give the result of unicode.isupper() for a Lexeme ID.
        >>> is_upper(lookup(u'HI'))
        True
        >>> is_upper(lookup(u'H10'))
        True
        >>> is_upper(lookup(u'10'))
        False
        """
        return (self.orth_flags & (1 << IS_UPPER)) != 0
    cpdef bint is_title(self) except *:
        """Give the result of unicode.istitle() for a Lexeme ID.
        >>> is_title(lookup(u'Hi'))
        True
        >>> is_title(lookup(u'Hi1'))
        True
        >>> is_title(lookup(u'1'))
        False
        """
        return (self.orth_flags & (1 << IS_TITLE)) != 0
    cpdef bint is_ascii(self) except *:
        """Give the result of checking whether all characters in the string are ascii.
        >>> is_ascii(lookup(u'Hi'))
        True
        >>> is_ascii(lookup(u' '))
        True
        >>> is_ascii(lookup(u'<unicode>'))
        False
        """
        return (self.orth_flags & (1 << IS_ASCII)) != 0