* Add orthography

This commit is contained in:
Matthew Honnibal 2014-08-20 17:03:44 +02:00
parent d42cdbb446
commit f39dcb1d89
6 changed files with 3777 additions and 0 deletions

View File

View File

View File

3567
spacy/orthography/latin.cpp Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,33 @@
# Orthographic flags. Each enumerator is a bit *position* (not a mask): the
# is_* predicates test a flag with ``orth_flags & (1 << FLAG)``.
cdef enum OrthFlag:
    IS_ALPHA = 0
    IS_DIGIT = 1
    IS_PUNCT = 2
    IS_SPACE = 3
    IS_LOWER = 4
    IS_UPPER = 5
    IS_TITLE = 6
    IS_ASCII = 7
# Indices into a Lexeme's ``string_views`` array. LAST3, NORM and SHAPE are
# read by last3_of(), norm_of() and shape_of(); LEX is presumably the view of
# the original string -- TODO confirm against the code that fills the array.
cdef enum:
    LEX = 0
    LAST3 = 1
    NORM = 2
    SHAPE = 3
from spacy.lexeme cimport LexID
from spacy.lexeme cimport StringHash
# Boolean orthographic predicates. Each takes a lexeme ID and is implemented
# as a test of one OrthFlag bit in the lexeme's ``orth_flags`` field.
# ``except *`` makes callers check for a raised exception after every call,
# because no bint return value can be reserved as an error marker.
cpdef bint is_alpha(LexID lex_id) except *
cpdef bint is_digit(LexID lex_id) except *
cpdef bint is_punct(LexID lex_id) except *
cpdef bint is_space(LexID lex_id) except *
cpdef bint is_lower(LexID lex_id) except *
cpdef bint is_upper(LexID lex_id) except *
cpdef bint is_title(LexID lex_id) except *
cpdef bint is_ascii(LexID lex_id) except *
# String-view accessors. Each returns a hash read from the lexeme's
# ``string_views`` array. ``except 0`` reserves 0 as the error return value,
# so 0 must never be produced as a legitimate hash.
cpdef StringHash norm_of(LexID lex_id) except 0
cpdef StringHash shape_of(LexID lex_id) except 0
cpdef StringHash last3_of(LexID lex_id) except 0

177
spacy/orthography/latin.pyx Normal file
View File

@ -0,0 +1,177 @@
# cython: embedsignature=True
from spacy.lexeme cimport Lexeme
def get_normalized(unicode lex):
    """Return a normalized form of ``lex``.

    A string that is entirely alphabetic and lower-case is returned
    unchanged; every other string is replaced by its word shape (see
    get_word_shape).
    """
    already_normal = lex.isalpha() and lex.islower()
    if not already_normal:
        return get_word_shape(lex)
    return lex
def get_word_shape(unicode lex):
    """Return the orthographic "shape" of ``lex``.

    Each character is mapped to a shape character: ``X`` for an upper-case
    letter, ``x`` for any other alphabetic character, ``d`` for a digit, and
    the character itself otherwise. A run of identical shape characters is
    truncated to at most three, so long words and numbers share a shape.

    >>> get_word_shape(u'Hello')
    u'Xxxx'
    >>> get_word_shape(u'255667')
    u'ddd'
    >>> get_word_shape(u'...')
    u'...'

    Raises AssertionError for an empty string (when assertions are enabled),
    because the resulting shape would be empty.
    """
    pieces = []      # shape characters kept so far; joined once at the end
    previous = ""    # shape character of the current run
    run = 0          # zero-based position within the current run
    for c in lex:
        if c.isalpha():
            shape_char = "X" if c.isupper() else "x"
        elif c.isdigit():
            shape_char = "d"
        else:
            shape_char = c
        if shape_char == previous:
            run += 1
        else:
            run = 0
            previous = shape_char
        # Positions 0, 1 and 2 of a run are kept; later repeats are dropped.
        if run < 3:
            pieces.append(shape_char)
    shape = "".join(pieces)
    assert shape
    return shape
cpdef bint is_alpha(LexID lex_id) except *:
    """Report whether the IS_ALPHA flag is set for a Lexeme ID.

    The flag is meant to hold the result of unicode.isalpha() for the
    lexeme's string; this function only reads the stored bit.

    >>> is_alpha(lookup(u'Hello'))
    True
    >>> is_alpha(lookup(u'العرب'))
    True
    >>> is_alpha(lookup(u'10'))
    False
    """
    # The ID is reinterpreted as a pointer to the Lexeme struct it identifies.
    cdef Lexeme* word = <Lexeme*>lex_id
    return word.orth_flags & (1 << IS_ALPHA)
cpdef bint is_digit(LexID lex_id) except *:
    r"""Report whether the IS_DIGIT flag is set for a Lexeme ID.

    The flag is meant to hold the result of unicode.isdigit() for the
    lexeme's string; this function only reads the stored bit.

    >>> is_digit(lookup(u'10'))
    True
    >>> is_digit(lookup(u'\u0663'))
    True
    >>> is_digit(lookup(u'one'))
    False
    """
    # The ID is reinterpreted as a pointer to the Lexeme struct it identifies.
    cdef Lexeme* word = <Lexeme*>lex_id
    return word.orth_flags & (1 << IS_DIGIT)
cpdef bint is_punct(LexID lex_id) except *:
    r"""Report whether the IS_PUNCT flag is set for a Lexeme ID.

    The flag is meant to record whether every character of the lexeme's
    string belongs to a punctuation unicode data category; this function
    only reads the stored bit.

    >>> is_punct(lookup(u'.'))
    True
    >>> is_punct(lookup(u'\u201c'))
    True
    >>> is_punct(lookup(u' '))
    False
    """
    # The ID is reinterpreted as a pointer to the Lexeme struct it identifies.
    cdef Lexeme* word = <Lexeme*>lex_id
    return word.orth_flags & (1 << IS_PUNCT)
cpdef bint is_space(LexID lex_id) except *:
    r"""Report whether the IS_SPACE flag is set for a Lexeme ID.

    The flag is meant to hold the result of unicode.isspace() for the
    lexeme's string; this function only reads the stored bit.

    >>> is_space(lookup(u'\t'))
    True
    >>> is_space(lookup(u'\u2003'))
    True
    >>> is_space(lookup(u'Hi\n'))
    False
    """
    # The ID is reinterpreted as a pointer to the Lexeme struct it identifies.
    cdef Lexeme* word = <Lexeme*>lex_id
    return word.orth_flags & (1 << IS_SPACE)
cpdef bint is_lower(LexID lex_id) except *:
    r"""Report whether the IS_LOWER flag is set for a Lexeme ID.

    The flag is meant to hold the result of unicode.islower() for the
    lexeme's string; this function only reads the stored bit.

    >>> is_lower(lookup(u'hi'))
    True
    >>> is_lower(lookup(u'\xe9'))
    True
    >>> is_lower(lookup(u'10'))
    False
    """
    # The ID is reinterpreted as a pointer to the Lexeme struct it identifies.
    cdef Lexeme* word = <Lexeme*>lex_id
    return word.orth_flags & (1 << IS_LOWER)
cpdef bint is_upper(LexID lex_id) except *:
    """Report whether the IS_UPPER flag is set for a Lexeme ID.

    The flag is meant to hold the result of unicode.isupper() for the
    lexeme's string; this function only reads the stored bit.

    >>> is_upper(lookup(u'HI'))
    True
    >>> is_upper(lookup(u'H10'))
    True
    >>> is_upper(lookup(u'10'))
    False
    """
    # The ID is reinterpreted as a pointer to the Lexeme struct it identifies.
    cdef Lexeme* word = <Lexeme*>lex_id
    return word.orth_flags & (1 << IS_UPPER)
cpdef bint is_title(LexID lex_id) except *:
    """Report whether the IS_TITLE flag is set for a Lexeme ID.

    The flag is meant to hold the result of unicode.istitle() for the
    lexeme's string; this function only reads the stored bit.

    >>> is_title(lookup(u'Hi'))
    True
    >>> is_title(lookup(u'Hi1'))
    True
    >>> is_title(lookup(u'1'))
    False
    """
    # The ID is reinterpreted as a pointer to the Lexeme struct it identifies.
    cdef Lexeme* word = <Lexeme*>lex_id
    return word.orth_flags & (1 << IS_TITLE)
cpdef bint is_ascii(LexID lex_id) except *:
    r"""Report whether the IS_ASCII flag is set for a Lexeme ID.

    The flag is meant to record whether every character of the lexeme's
    string is ASCII; this function only reads the stored bit.

    >>> is_ascii(lookup(u'Hi'))
    True
    >>> is_ascii(lookup(u' '))
    True
    >>> is_ascii(lookup(u'\xe9'))
    False
    """
    # The ID is reinterpreted as a pointer to the Lexeme struct it identifies.
    cdef Lexeme* word = <Lexeme*>lex_id
    return word.orth_flags & (1 << IS_ASCII)
cpdef StringHash norm_of(LexID lex_id) except 0:
    """Return the hash stored in the NORM string view of a Lexeme ID.

    The view is described as a normalized version of the lexeme's string.

    >>> unhash(norm_of(lookup(u'Hi')))
    u'hi'
    >>> unhash(norm_of(lookup(u'255667')))
    u'shape=dddd'
    >>> unhash(norm_of(lookup(u'...')))
    u'...'
    """
    # NOTE(review): get_normalized() in this module would produce u'Xx' and
    # u'ddd' for the first two examples above. Where the NORM view is filled
    # in is not visible here -- confirm which behaviour the examples should show.
    cdef Lexeme* word = <Lexeme*>lex_id
    return word.string_views[NORM]
cpdef StringHash shape_of(LexID lex_id) except 0:
    """Return the hash stored in the SHAPE string view of a Lexeme ID.

    The view is described as the shape of the lexeme's string.

    >>> unhash(shape_of(lookup(u'Hi')))
    u'Xx'
    >>> unhash(shape_of(lookup(u'255667')))
    u'dddd'
    >>> unhash(shape_of(lookup(u'...')))
    u'...'
    """
    # NOTE(review): get_word_shape() in this module truncates runs to three
    # characters, which would give u'ddd' for u'255667'. Where the SHAPE view
    # is filled in is not visible here -- confirm the expected value above.
    return (<Lexeme*>lex_id).string_views[SHAPE]
cpdef StringHash last3_of(LexID lex_id) except 0:
    """Return the hash stored in the LAST3 string view of a Lexeme ID.

    The view is described as the hash of the last three characters of the
    word (or of the whole word when it is shorter):

    >>> lex_ids = [lookup(w) for w in (u'Hello', u'!')]
    >>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
    [u'llo', u'!']
    """
    # The ID is reinterpreted as a pointer to the Lexeme struct it identifies.
    cdef Lexeme* word = <Lexeme*>lex_id
    return word.string_views[LAST3]