* Add orthography

2025-10-26 05:31:15 +03:00 · 2014-08-20 17:03:44 +02:00 · 2014-08-20 17:03:44 +02:00 · f39dcb1d89
commit f39dcb1d89
parent d42cdbb446
6 changed files with 3777 additions and 0 deletions
--- a/spacy/orthography/init.pxd
+++ b/spacy/orthography/init.pxd
--- a/spacy/orthography/init.py
+++ b/spacy/orthography/init.py
--- a/spacy/orthography/init.pyx
+++ b/spacy/orthography/init.pyx
--- a/spacy/orthography/latin.cpp
+++ b/spacy/orthography/latin.cpp
--- a/spacy/orthography/latin.pxd
+++ b/spacy/orthography/latin.pxd
@ -0,0 +1,33 @@
 # Bit positions of the boolean orthographic features packed into
 # Lexeme.orth_flags. The is_* predicates in latin.pyx test a feature with
 # `orth_flags & (1 << FLAG)`. Members take the default consecutive values
 # starting at 0, so their order here fixes the bit layout.
 cdef enum OrthFlag:
    IS_ALPHA
    IS_DIGIT
    IS_PUNCT
    IS_SPACE
    IS_LOWER
    IS_UPPER
    IS_TITLE
    IS_ASCII
 # Indices into Lexeme.string_views. NORM, SHAPE and LAST3 are read by
 # norm_of(), shape_of() and last3_of() in latin.pyx. LEX is not referenced
 # by those accessors -- presumably the slot for the unmodified string;
 # TODO confirm against the code that fills string_views.
 cdef enum:
    LEX
    LAST3
    NORM
    SHAPE
 from spacy.lexeme cimport LexID
 from spacy.lexeme cimport StringHash
 # Flag predicates: each one tests a single OrthFlag bit of the Lexeme that
 # lex_id addresses. `except *` makes every caller check for a pending Python
 # exception after the call -- presumably chosen because a bint result has no
 # spare value that could be reserved as an error marker.
 cpdef bint is_alpha(LexID lex_id) except *
 cpdef bint is_digit(LexID lex_id) except *
 cpdef bint is_punct(LexID lex_id) except *
 cpdef bint is_space(LexID lex_id) except *
 cpdef bint is_lower(LexID lex_id) except *
 cpdef bint is_upper(LexID lex_id) except *
 cpdef bint is_title(LexID lex_id) except *
 cpdef bint is_ascii(LexID lex_id) except *
 # String-view accessors: each one returns a StringHash read from the
 # Lexeme's string_views array. `except 0` reserves the value 0 as the error
 # return, so a valid hash is expected never to be 0.
 cpdef StringHash norm_of(LexID lex_id) except 0
 cpdef StringHash shape_of(LexID lex_id) except 0
 cpdef StringHash last3_of(LexID lex_id) except 0
--- a/spacy/orthography/latin.pyx
+++ b/spacy/orthography/latin.pyx
@ -0,0 +1,177 @@
 # cython: embedsignature=True
 from spacy.lexeme cimport Lexeme
 def get_normalized(unicode lex):
    """Return the normalised form of a word string.

    A string that is entirely alphabetic and entirely lower-case is already
    considered normal and is returned unchanged. Every other string is
    replaced by its word shape, as computed by get_word_shape().
    """
    already_normal = lex.isalpha() and lex.islower()
    if not already_normal:
        return get_word_shape(lex)
    return lex
 def get_word_shape(unicode lex):
    """Return a coarse orthographic "shape" string for a word.

    Every character is mapped to a class symbol:

    * "X" for an upper-case alphabetic character,
    * "x" for any other alphabetic character,
    * "d" for a digit,
    * the character itself for everything else.

    A run of identical symbols is truncated after its third member, so
    u'Hello' becomes u'Xxxx' and u'255667' becomes u'ddd'.

    Raises AssertionError when the resulting shape is empty, which happens
    for an empty input string.
    """
    symbols = []
    last = ""
    # Zero-based position of the current symbol within its run of repeats.
    seq = 0
    for c in lex:
        if c.isalpha():
            shape_char = "X" if c.isupper() else "x"
        elif c.isdigit():
            shape_char = "d"
        else:
            shape_char = c
        if shape_char == last:
            seq += 1
        else:
            seq = 0
            last = shape_char
        # Keep only the first three symbols of any run.
        if seq < 3:
            symbols.append(shape_char)
    shape = "".join(symbols)
    assert shape
    return shape
 cpdef bint is_alpha(LexID lex_id) except *:
    """Report whether the IS_ALPHA flag is set for the lexeme at lex_id.

    lex_id is reinterpreted as a pointer to a Lexeme and the IS_ALPHA bit of
    its orth_flags field is tested. The flag is documented as holding the
    result of unicode.isalpha() for the lexeme's string.

    >>> is_alpha(lookup(u'Hello'))
    True
    >>> is_alpha(lookup(u'العرب'))
    True
    >>> is_alpha(lookup(u'10'))
    False
    """
    cdef Lexeme* word = <Lexeme*>lex_id
    return word.orth_flags & (1 << IS_ALPHA)
 cpdef bint is_digit(LexID lex_id) except *:
    """Report whether the IS_DIGIT flag is set for the lexeme at lex_id.

    lex_id is reinterpreted as a pointer to a Lexeme and the IS_DIGIT bit of
    its orth_flags field is tested. The flag is documented as holding the
    result of unicode.isdigit() for the lexeme's string.

    >>> is_digit(lookup(u'10'))
    True
    >>> is_digit(lookup(u'๐'))
    True
    >>> is_digit(lookup(u'one'))
    False
    """
    cdef Lexeme* word = <Lexeme*>lex_id
    return word.orth_flags & (1 << IS_DIGIT)
 cpdef bint is_punct(LexID lex_id) except *:
    """Report whether the IS_PUNCT flag is set for the lexeme at lex_id.

    lex_id is reinterpreted as a pointer to a Lexeme and the IS_PUNCT bit of
    its orth_flags field is tested. The flag is documented as recording
    whether all characters of the lexeme's string belong to a punctuation
    unicode data category.

    >>> is_punct(lookup(u'.'))
    True
    >>> is_punct(lookup(u'⁒'))
    True
    >>> is_punct(lookup(u' '))
    False

    NOTE(review): U+2052 appears to have general category Sm (math symbol)
    rather than a P* punctuation category -- confirm the second example
    against the code that sets this flag.
    """
    cdef Lexeme* word = <Lexeme*>lex_id
    return word.orth_flags & (1 << IS_PUNCT)
 cpdef bint is_space(LexID lex_id) except *:
    r"""Report whether the IS_SPACE flag is set for the lexeme at lex_id.

    lex_id is reinterpreted as a pointer to a Lexeme and the IS_SPACE bit of
    its orth_flags field is tested. The flag is documented as holding the
    result of unicode.isspace() for the lexeme's string.

    The docstring is a raw string so that the escape sequences in the
    examples reach doctest as written instead of being expanded into real
    control characters.

    >>> is_space(lookup(u'\t'))
    True
    >>> is_space(lookup(u'\u00a0'))
    True
    >>> is_space(lookup(u'Hi\n'))
    False
    """
    cdef Lexeme* word = <Lexeme*>lex_id
    return word.orth_flags & (1 << IS_SPACE)
 cpdef bint is_lower(LexID lex_id) except *:
    """Report whether the IS_LOWER flag is set for the lexeme at lex_id.

    lex_id is reinterpreted as a pointer to a Lexeme and the IS_LOWER bit of
    its orth_flags field is tested. The flag is documented as holding the
    result of unicode.islower() for the lexeme's string.

    >>> is_lower(lookup(u'hi'))
    True
    >>> is_lower(lookup(u'é'))
    True
    >>> is_lower(lookup(u'10'))
    False
    """
    cdef Lexeme* word = <Lexeme*>lex_id
    return word.orth_flags & (1 << IS_LOWER)
 cpdef bint is_upper(LexID lex_id) except *:
    """Report whether the IS_UPPER flag is set for the lexeme at lex_id.

    lex_id is reinterpreted as a pointer to a Lexeme and the IS_UPPER bit of
    its orth_flags field is tested. The flag is documented as holding the
    result of unicode.isupper() for the lexeme's string.

    >>> is_upper(lookup(u'HI'))
    True
    >>> is_upper(lookup(u'H10'))
    True
    >>> is_upper(lookup(u'10'))
    False
    """
    cdef Lexeme* word = <Lexeme*>lex_id
    return word.orth_flags & (1 << IS_UPPER)
 cpdef bint is_title(LexID lex_id) except *:
    """Report whether the IS_TITLE flag is set for the lexeme at lex_id.

    lex_id is reinterpreted as a pointer to a Lexeme and the IS_TITLE bit of
    its orth_flags field is tested. The flag is documented as holding the
    result of unicode.istitle() for the lexeme's string.

    >>> is_title(lookup(u'Hi'))
    True
    >>> is_title(lookup(u'Hi1'))
    True
    >>> is_title(lookup(u'1'))
    False
    """
    cdef Lexeme* word = <Lexeme*>lex_id
    return word.orth_flags & (1 << IS_TITLE)
 cpdef bint is_ascii(LexID lex_id) except *:
    """Report whether the IS_ASCII flag is set for the lexeme at lex_id.

    lex_id is reinterpreted as a pointer to a Lexeme and the IS_ASCII bit of
    its orth_flags field is tested. The flag is documented as recording
    whether all characters of the lexeme's string are ASCII.

    >>> is_ascii(lookup(u'Hi'))
    True
    >>> is_ascii(lookup(u' '))
    True
    >>> is_ascii(lookup(u'é'))
    False
    """
    cdef Lexeme* word = <Lexeme*>lex_id
    return word.orth_flags & (1 << IS_ASCII)
 cpdef StringHash norm_of(LexID lex_id) except 0:
    """Return the hash of the normalised form of a lexeme's string.

    lex_id is reinterpreted as a pointer to a Lexeme and the NORM entry of
    its string_views array is returned. Because the function is declared
    `except 0`, a return value of 0 is treated as an error by callers.

    >>> unhash(norm_of(lookup(u'Hi')))
    u'hi'
    >>> unhash(norm_of(lookup(u'255667')))
    u'shape=dddd'
    >>> unhash(norm_of(lookup(u'...')))
    u'...'

    NOTE(review): get_normalized() in this module maps u'Hi' to u'Xx' and
    u'255667' to u'ddd', which does not match the first two expected values
    above -- confirm how string_views[NORM] is populated and update the
    examples accordingly.
    """
    cdef Lexeme* word = <Lexeme*>lex_id
    return word.string_views[NORM]
 cpdef StringHash shape_of(LexID lex_id) except 0:
    """Return the hash of the word shape of a lexeme's string.

    lex_id is reinterpreted as a pointer to a Lexeme and the SHAPE entry of
    its string_views array is returned. Because the function is declared
    `except 0`, a return value of 0 is treated as an error by callers.

    >>> unhash(shape_of(lookup(u'Hi')))
    u'Xx'
    >>> unhash(shape_of(lookup(u'255667')))
    u'dddd'
    >>> unhash(shape_of(lookup(u'...')))
    u'...'

    NOTE(review): get_word_shape() in this module truncates runs to three
    symbols and would give u'ddd' for u'255667' -- confirm whether the SHAPE
    entry is produced by that function and fix the second example if so.
    """
    return (<Lexeme*>lex_id).string_views[SHAPE]
 cpdef StringHash last3_of(LexID lex_id) except 0:
    """Return the LAST3 string view of the lexeme at lex_id.

    lex_id is reinterpreted as a pointer to a Lexeme and the LAST3 entry of
    its string_views array is returned. That entry is documented as the hash
    of the last three characters of the word:

    >>> lex_ids = [lookup(w) for w in (u'Hello', u'!')]
    >>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
    [u'llo', u'!']
    """
    cdef Lexeme* word = <Lexeme*>lex_id
    return word.string_views[LAST3]