mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
* Add orthography
This commit is contained in:
parent
d42cdbb446
commit
f39dcb1d89
0
spacy/orthography/__init__.pxd
Normal file
0
spacy/orthography/__init__.pxd
Normal file
0
spacy/orthography/__init__.py
Normal file
0
spacy/orthography/__init__.py
Normal file
0
spacy/orthography/__init__.pyx
Normal file
0
spacy/orthography/__init__.pyx
Normal file
3567
spacy/orthography/latin.cpp
Normal file
3567
spacy/orthography/latin.cpp
Normal file
File diff suppressed because it is too large
Load Diff
33
spacy/orthography/latin.pxd
Normal file
33
spacy/orthography/latin.pxd
Normal file
|
@ -0,0 +1,33 @@
|
|||
# Bit positions of the boolean orthographic flags.  No explicit values are
# given, so the members number upward from 0 in declaration order.  In
# latin.pyx each is_*() predicate tests ``orth_flags & (1 << FLAG)`` on a
# Lexeme, so reordering these members changes which bit each predicate reads.
cdef enum OrthFlag:
    IS_ALPHA
    IS_DIGIT
    IS_PUNCT
    IS_SPACE
    IS_LOWER
    IS_UPPER
    IS_TITLE
    IS_ASCII
|
||||
|
||||
|
||||
# Indices into a Lexeme's ``string_views`` array.  latin.pyx reads the NORM,
# SHAPE and LAST3 slots (norm_of, shape_of, last3_of); LEX is not read by any
# code in that file.
# NOTE(review): LEX presumably indexes the hash of the original string --
# confirm against the code that fills string_views.
cdef enum:
    LEX
    LAST3
    NORM
    SHAPE
|
||||
|
||||
from spacy.lexeme cimport LexID
|
||||
from spacy.lexeme cimport StringHash
|
||||
|
||||
# Boolean flag predicates, implemented in latin.pyx.  Each takes a LexID and
# tests one OrthFlag bit of the Lexeme's orth_flags.  ``except *`` makes
# callers check for a pending exception after every call.
cpdef bint is_alpha(LexID lex_id) except *
cpdef bint is_digit(LexID lex_id) except *
cpdef bint is_punct(LexID lex_id) except *
cpdef bint is_space(LexID lex_id) except *
cpdef bint is_lower(LexID lex_id) except *
cpdef bint is_upper(LexID lex_id) except *
cpdef bint is_title(LexID lex_id) except *
cpdef bint is_ascii(LexID lex_id) except *


# String-view accessors, implemented in latin.pyx.  Each returns one entry of
# the Lexeme's string_views array.  ``except 0`` reserves a return value of 0
# as the error signal.
cpdef StringHash norm_of(LexID lex_id) except 0
cpdef StringHash shape_of(LexID lex_id) except 0
cpdef StringHash last3_of(LexID lex_id) except 0
|
177
spacy/orthography/latin.pyx
Normal file
177
spacy/orthography/latin.pyx
Normal file
|
@ -0,0 +1,177 @@
|
|||
# cython: embedsignature=True
|
||||
from spacy.lexeme cimport Lexeme
|
||||
|
||||
def get_normalized(unicode lex):
    """Return a normalized form of ``lex``.

    A string that is entirely alphabetic and lower-case is returned
    unchanged; every other string is replaced by its word shape, as computed
    by get_word_shape().
    """
    already_plain = lex.isalpha() and lex.islower()
    return lex if already_plain else get_word_shape(lex)
|
||||
|
||||
|
||||
def get_word_shape(unicode lex):
    """Collapse ``lex`` into a coarse "shape" signature.

    Every alphabetic character becomes ``X`` (upper-case) or ``x`` (any other
    alphabetic character), every digit becomes ``d``, and all remaining
    characters are kept unchanged.  Runs of the same shape character are
    truncated so that at most three consecutive copies are emitted.

    Examples (input -> shape)::

        Hello   -> Xxxx
        255667  -> ddd
        ...     -> ...

    Raises:
        AssertionError: if the resulting shape is empty, i.e. ``lex`` is the
            empty string.
    """
    # Collect the pieces and join once rather than growing a string with +=.
    pieces = []
    last = ""
    run = 0  # how many times `last` has repeated since the run started
    for c in lex:
        if c.isalpha():
            shape_char = "X" if c.isupper() else "x"
        elif c.isdigit():
            shape_char = "d"
        else:
            shape_char = c
        if shape_char == last:
            run += 1
        else:
            run = 0
            last = shape_char
        # run counts from 0, so this admits the first three of each run.
        if run < 3:
            pieces.append(shape_char)
    shape = "".join(pieces)
    assert shape
    return shape
|
||||
|
||||
|
||||
cpdef bint is_alpha(LexID lex_id) except *:
    """Test the IS_ALPHA bit of the Lexeme addressed by ``lex_id``.

    The flag is documented as holding the result of unicode.isalpha() for the
    lexeme's string; it is only read here, not computed.

    >>> is_alpha(lookup(u'Hello'))
    True
    >>> is_alpha(lookup(u'العرب'))
    True
    >>> is_alpha(lookup(u'10'))
    False
    """
    cdef Lexeme* word = <Lexeme*>lex_id
    return word.orth_flags & (1 << IS_ALPHA)
|
||||
|
||||
|
||||
cpdef bint is_digit(LexID lex_id) except *:
    """Test the IS_DIGIT bit of the Lexeme addressed by ``lex_id``.

    The flag is documented as holding the result of unicode.isdigit() for the
    lexeme's string; it is only read here, not computed.

    >>> is_digit(lookup(u'10'))
    True
    >>> is_digit(lookup(u'๐'))
    True
    >>> is_digit(lookup(u'one'))
    False
    """
    cdef Lexeme* word = <Lexeme*>lex_id
    return word.orth_flags & (1 << IS_DIGIT)
|
||||
|
||||
|
||||
cpdef bint is_punct(LexID lex_id) except *:
    """Test the IS_PUNCT bit of the Lexeme addressed by ``lex_id``.

    The flag is documented as recording whether every character of the
    lexeme's string belongs to a punctuation unicode data category; it is
    only read here, not computed.

    >>> is_punct(lookup(u'.'))
    True
    >>> is_punct(lookup(u'⁒'))
    True
    >>> is_punct(lookup(u' '))
    False
    """
    cdef Lexeme* word = <Lexeme*>lex_id
    return word.orth_flags & (1 << IS_PUNCT)
|
||||
|
||||
|
||||
cpdef bint is_space(LexID lex_id) except *:
    """Test the IS_SPACE bit of the Lexeme addressed by ``lex_id``.

    The flag is documented as holding the result of unicode.isspace() for the
    lexeme's string; it is only read here, not computed.

    In the second example ``<unicode space>`` is a placeholder for a
    non-ASCII whitespace character, not a literal string.

    >>> is_space(lookup(u'\t'))
    True
    >>> is_space(lookup(u'<unicode space>'))
    True
    >>> is_space(lookup(u'Hi\n'))
    False
    """
    cdef Lexeme* word = <Lexeme*>lex_id
    return word.orth_flags & (1 << IS_SPACE)
|
||||
|
||||
|
||||
cpdef bint is_lower(LexID lex_id) except *:
    """Test the IS_LOWER bit of the Lexeme addressed by ``lex_id``.

    The flag is documented as holding the result of unicode.islower() for the
    lexeme's string; it is only read here, not computed.

    In the second example ``<unicode>`` is a placeholder for a non-ASCII
    lower-case string, not a literal string.

    >>> is_lower(lookup(u'hi'))
    True
    >>> is_lower(lookup(u'<unicode>'))
    True
    >>> is_lower(lookup(u'10'))
    False
    """
    cdef Lexeme* word = <Lexeme*>lex_id
    return word.orth_flags & (1 << IS_LOWER)
|
||||
|
||||
|
||||
cpdef bint is_upper(LexID lex_id) except *:
    """Test the IS_UPPER bit of the Lexeme addressed by ``lex_id``.

    The flag is documented as holding the result of unicode.isupper() for the
    lexeme's string; it is only read here, not computed.

    >>> is_upper(lookup(u'HI'))
    True
    >>> is_upper(lookup(u'H10'))
    True
    >>> is_upper(lookup(u'10'))
    False
    """
    cdef Lexeme* word = <Lexeme*>lex_id
    return word.orth_flags & (1 << IS_UPPER)
|
||||
|
||||
|
||||
cpdef bint is_title(LexID lex_id) except *:
    """Test the IS_TITLE bit of the Lexeme addressed by ``lex_id``.

    The flag is documented as holding the result of unicode.istitle() for the
    lexeme's string; it is only read here, not computed.

    >>> is_title(lookup(u'Hi'))
    True
    >>> is_title(lookup(u'Hi1'))
    True
    >>> is_title(lookup(u'1'))
    False
    """
    cdef Lexeme* word = <Lexeme*>lex_id
    return word.orth_flags & (1 << IS_TITLE)
|
||||
|
||||
|
||||
cpdef bint is_ascii(LexID lex_id) except *:
    """Test the IS_ASCII bit of the Lexeme addressed by ``lex_id``.

    The flag is documented as recording whether every character of the
    lexeme's string is ASCII; it is only read here, not computed.

    In the third example ``<unicode>`` is a placeholder for a string that
    contains non-ASCII characters, not a literal string.

    >>> is_ascii(lookup(u'Hi'))
    True
    >>> is_ascii(lookup(u' '))
    True
    >>> is_ascii(lookup(u'<unicode>'))
    False
    """
    cdef Lexeme* word = <Lexeme*>lex_id
    return word.orth_flags & (1 << IS_ASCII)
|
||||
|
||||
|
||||
cpdef StringHash norm_of(LexID lex_id) except 0:
    """Return the NORM entry of the Lexeme's string_views.

    The entry is documented as the hash of a normalized version of the
    lexeme's string.  A return value of 0 is reserved as the error signal
    (``except 0``).

    NOTE(review): the expected values below are carried over from the
    original docstring and depend on the code that fills string_views, which
    is not in this file -- confirm before relying on them as doctests.

    >>> unhash(norm_of(lookup(u'Hi')))
    u'hi'
    >>> unhash(norm_of(lookup(u'255667')))
    u'shape=dddd'
    >>> unhash(norm_of(lookup(u'...')))
    u'...'
    """
    cdef Lexeme* word = <Lexeme*>lex_id
    return word.string_views[NORM]
|
||||
|
||||
|
||||
cpdef StringHash shape_of(LexID lex_id) except 0:
    """Return the SHAPE entry of the Lexeme's string_views.

    The entry is documented as the hash of the string's shape.  A return
    value of 0 is reserved as the error signal (``except 0``).

    NOTE(review): the expected values below are carried over from the
    original docstring.  get_word_shape() in this file emits at most three
    repeats per run (``ddd`` for ``255667``), so the ``u'dddd'`` example
    holds only if string_views is filled by different code -- confirm.

    >>> unhash(shape_of(lookup(u'Hi')))
    u'Xx'
    >>> unhash(shape_of(lookup(u'255667')))
    u'dddd'
    >>> unhash(shape_of(lookup(u'...')))
    u'...'
    """
    return (<Lexeme*>lex_id).string_views[SHAPE]
|
||||
|
||||
|
||||
cpdef StringHash last3_of(LexID lex_id) except 0:
    """Return the LAST3 entry of the Lexeme's string_views.

    The entry is documented as the hash of the last three characters of the
    word.  A return value of 0 is reserved as the error signal
    (``except 0``).

    >>> lex_ids = [lookup(w) for w in (u'Hello', u'!')]
    >>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
    [u'llo', u'!']
    """
    cdef Lexeme* word = <Lexeme*>lex_id
    return word.string_views[LAST3]
|
Loading…
Reference in New Issue
Block a user