* Add orthography

2025-11-28 05:45:44 +03:00 · 2014-08-20 17:03:44 +02:00 · 2014-08-20 17:03:44 +02:00 · f39dcb1d89
commit f39dcb1d89
parent d42cdbb446
6 changed files with 3777 additions and 0 deletions
--- a/spacy/orthography/init.pxd
+++ b/spacy/orthography/init.pxd
--- a/spacy/orthography/init.py
+++ b/spacy/orthography/init.py
--- a/spacy/orthography/init.pyx
+++ b/spacy/orthography/init.pyx
--- a/spacy/orthography/latin.cpp
+++ b/spacy/orthography/latin.cpp
--- a/spacy/orthography/latin.pxd
+++ b/spacy/orthography/latin.pxd
@ -0,0 +1,33 @@
+# Bit positions of the boolean orthographic features of a Lexeme.
+# The is_* functions in latin.pyx test bit `1 << FLAG` of Lexeme.orth_flags.
+# Members are implicitly numbered from 0 in declaration order, so reordering
+# them changes the meaning of any orth_flags value that has been computed.
+cdef enum OrthFlag:
+    IS_ALPHA  # read by is_alpha(): documented as unicode.isalpha()
+    IS_DIGIT  # read by is_digit(): documented as unicode.isdigit()
+    IS_PUNCT  # read by is_punct(): all characters in a punctuation category
+    IS_SPACE  # read by is_space(): documented as unicode.isspace()
+    IS_LOWER  # read by is_lower(): documented as unicode.islower()
+    IS_UPPER  # read by is_upper(): documented as unicode.isupper()
+    IS_TITLE  # read by is_title(): documented as unicode.istitle()
+    IS_ASCII  # read by is_ascii(): all characters are ASCII
+
+
+# Indices into a Lexeme's string_views array. In latin.pyx, norm_of(),
+# shape_of() and last3_of() return the NORM, SHAPE and LAST3 slots
+# respectively. LEX is not read by the accessors in latin.pyx.
+# NOTE(review): LEX presumably holds the hash of the full word form -- confirm.
+cdef enum:
+    LEX
+    LAST3
+    NORM
+    SHAPE
+
+from spacy.lexeme cimport LexID
+from spacy.lexeme cimport StringHash
+
+# Boolean orthographic predicates, one per OrthFlag member. Each takes a
+# LexID and is implemented in latin.pyx as a bit test on Lexeme.orth_flags.
+# `except *` means no return value is reserved as an error marker: callers
+# check for a pending Python exception after every call.
+cpdef bint is_alpha(LexID lex_id) except *
+cpdef bint is_digit(LexID lex_id) except *
+cpdef bint is_punct(LexID lex_id) except *
+cpdef bint is_space(LexID lex_id) except *
+cpdef bint is_lower(LexID lex_id) except *
+cpdef bint is_upper(LexID lex_id) except *
+cpdef bint is_title(LexID lex_id) except *
+cpdef bint is_ascii(LexID lex_id) except *
+
+
+# Accessors for the hashed string views of a Lexeme (see the NORM, SHAPE and
+# LAST3 indices above). `except 0` reserves the return value 0 as the error
+# signal, so a valid hash must never be 0.
+cpdef StringHash norm_of(LexID lex_id) except 0
+cpdef StringHash shape_of(LexID lex_id) except 0
+cpdef StringHash last3_of(LexID lex_id) except 0
--- a/spacy/orthography/latin.pyx
+++ b/spacy/orthography/latin.pyx
@ -0,0 +1,177 @@
+# cython: embedsignature=True
+from spacy.lexeme cimport Lexeme
+
+def get_normalized(unicode lex):
+    """Return the normalized form of lex.
+
+    A string that is entirely alphabetic and lower-case is returned
+    unchanged; anything else is replaced by its word shape, as computed by
+    get_word_shape().
+    """
+    already_normal = lex.isalpha() and lex.islower()
+    if not already_normal:
+        return get_word_shape(lex)
+    return lex
+
+
+def get_word_shape(unicode lex):
+    """Return a coarse "shape" string describing the characters of lex.
+
+    Each alphabetic character maps to "X" if it is upper-case and to "x"
+    otherwise, each digit maps to "d", and any other character is kept
+    as-is. A run of identical shape characters contributes at most three
+    characters to the result.
+
+    >>> get_word_shape(u'Hello')
+    'Xxxx'
+    >>> get_word_shape(u'255667')
+    'ddd'
+    >>> get_word_shape(u'C3PO')
+    'XdXX'
+
+    Raises AssertionError if the resulting shape is empty, which happens
+    when lex is the empty string.
+    """
+    # Collect the pieces and join once instead of repeated string +=.
+    pieces = []
+    last = ""
+    seq = 0
+    for c in lex:
+        if c.isalpha():
+            shape_char = "X" if c.isupper() else "x"
+        elif c.isdigit():
+            shape_char = "d"
+        else:
+            shape_char = c
+        if shape_char == last:
+            seq += 1
+        else:
+            seq = 0
+            last = shape_char
+        # seq counts the repeats after the first character of a run, so
+        # `seq < 3` keeps at most three characters per run.
+        if seq < 3:
+            pieces.append(shape_char)
+    shape = "".join(pieces)
+    assert shape
+    return shape
+
+
+cpdef bint is_alpha(LexID lex_id) except *:
+    """Report whether the IS_ALPHA bit is set in the orth_flags of the Lexeme
+    that lex_id points to, i.e. the result of unicode.isalpha() for the word.
+
+    >>> is_alpha(lookup(u'Hello'))
+    True
+    >>> is_alpha(lookup(u'العرب'))
+    True
+    >>> is_alpha(lookup(u'10'))
+    False
+    """
+    cdef Lexeme* word = <Lexeme*>lex_id
+    return word.orth_flags & (1 << IS_ALPHA)
+
+
+cpdef bint is_digit(LexID lex_id) except *:
+    """Report whether the IS_DIGIT bit is set in the orth_flags of the Lexeme
+    that lex_id points to, i.e. the result of unicode.isdigit() for the word.
+
+    >>> is_digit(lookup(u'10'))
+    True
+    >>> is_digit(lookup(u'๐'))
+    True
+    >>> is_digit(lookup(u'one'))
+    False
+    """
+    cdef Lexeme* word = <Lexeme*>lex_id
+    return word.orth_flags & (1 << IS_DIGIT)
+
+
+cpdef bint is_punct(LexID lex_id) except *:
+    """Report whether the IS_PUNCT bit is set in the orth_flags of the Lexeme
+    that lex_id points to, i.e. whether all characters of the word belong to
+    a punctuation unicode data category.
+
+    >>> is_punct(lookup(u'.'))
+    True
+    >>> is_punct(lookup(u'⁒'))
+    True
+    >>> is_punct(lookup(u' '))
+    False
+    """
+    cdef Lexeme* word = <Lexeme*>lex_id
+    return word.orth_flags & (1 << IS_PUNCT)
+
+
+cpdef bint is_space(LexID lex_id) except *:
+    """Report whether the IS_SPACE bit is set in the orth_flags of the Lexeme
+    that lex_id points to, i.e. the result of unicode.isspace() for the word.
+
+    In the second example `<unicode space>` is a placeholder for a non-ASCII
+    whitespace character, not a literal string.
+
+    >>> is_space(lookup(u'\t'))
+    True
+    >>> is_space(lookup(u'<unicode space>'))
+    True
+    >>> is_space(lookup(u'Hi\n'))
+    False
+    """
+    cdef Lexeme* word = <Lexeme*>lex_id
+    return word.orth_flags & (1 << IS_SPACE)
+
+
+cpdef bint is_lower(LexID lex_id) except *:
+    """Report whether the IS_LOWER bit is set in the orth_flags of the Lexeme
+    that lex_id points to, i.e. the result of unicode.islower() for the word.
+
+    In the second example `<unicode>` is a placeholder for a lower-case
+    non-ASCII string, not valid syntax.
+
+    >>> is_lower(lookup(u'hi'))
+    True
+    >>> is_lower(lookup(<unicode>))
+    True
+    >>> is_lower(lookup(u'10'))
+    False
+    """
+    cdef Lexeme* word = <Lexeme*>lex_id
+    return word.orth_flags & (1 << IS_LOWER)
+
+
+cpdef bint is_upper(LexID lex_id) except *:
+    """Report whether the IS_UPPER bit is set in the orth_flags of the Lexeme
+    that lex_id points to, i.e. the result of unicode.isupper() for the word.
+
+    >>> is_upper(lookup(u'HI'))
+    True
+    >>> is_upper(lookup(u'H10'))
+    True
+    >>> is_upper(lookup(u'10'))
+    False
+    """
+    cdef Lexeme* word = <Lexeme*>lex_id
+    return word.orth_flags & (1 << IS_UPPER)
+
+
+cpdef bint is_title(LexID lex_id) except *:
+    """Report whether the IS_TITLE bit is set in the orth_flags of the Lexeme
+    that lex_id points to, i.e. the result of unicode.istitle() for the word.
+
+    >>> is_title(lookup(u'Hi'))
+    True
+    >>> is_title(lookup(u'Hi1'))
+    True
+    >>> is_title(lookup(u'1'))
+    False
+    """
+    cdef Lexeme* word = <Lexeme*>lex_id
+    return word.orth_flags & (1 << IS_TITLE)
+
+
+cpdef bint is_ascii(LexID lex_id) except *:
+    """Report whether the IS_ASCII bit is set in the orth_flags of the Lexeme
+    that lex_id points to, i.e. whether all characters of the word are ascii.
+
+    In the third example `<unicode>` is a placeholder for a string that
+    contains non-ASCII characters, not a literal string.
+
+    >>> is_ascii(lookup(u'Hi'))
+    True
+    >>> is_ascii(lookup(u' '))
+    True
+    >>> is_ascii(lookup(u'<unicode>'))
+    False
+    """
+    cdef Lexeme* word = <Lexeme*>lex_id
+    return word.orth_flags & (1 << IS_ASCII)
+
+
+cpdef StringHash norm_of(LexID lex_id) except 0:
+    """Return the NORM entry of the string_views of the Lexeme that lex_id
+    points to: the hash of a normalized version of the string.
+
+    >>> unhash(norm_of(lookup(u'Hi')))
+    u'hi'
+    >>> unhash(norm_of(lookup(u'255667')))
+    u'shape=dddd'
+    >>> unhash(norm_of(lookup(u'...')))
+    u'...'
+
+    NOTE(review): the expected values above are unverified. If the NORM view
+    is produced by get_normalized() in this module, u'Hi' would map to its
+    shape rather than u'hi' -- confirm against the code that fills
+    string_views.
+    """
+    cdef Lexeme* word = <Lexeme*>lex_id
+    return word.string_views[NORM]
+
+
+cpdef StringHash shape_of(LexID lex_id) except 0:
+    """Return the SHAPE entry of the string_views of the Lexeme that lex_id
+    points to: the hash of the string shape.
+
+    >>> unhash(shape_of(lookup(u'Hi')))
+    u'Xx'
+    >>> unhash(shape_of(lookup(u'255667')))
+    u'dddd'
+    >>> unhash(shape_of(lookup(u'...')))
+    u'...'
+
+    NOTE(review): get_word_shape() in this module keeps at most three
+    characters per run, so if the SHAPE view is derived from it the second
+    example would give u'ddd' -- confirm which is intended.
+    """
+    return (<Lexeme*>lex_id).string_views[SHAPE]
+
+
+cpdef StringHash last3_of(LexID lex_id) except 0:
+    """Return the LAST3 entry of the string_views of the Lexeme that lex_id
+    points to: the hash of the last three characters of the word.
+
+    >>> lex_ids = [lookup(w) for w in (u'Hello', u'!')]
+    >>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
+    [u'llo', u'!']
+    """
+    cdef Lexeme* word = <Lexeme*>lex_id
+    return word.string_views[LAST3]