# cython: embedsignature=True
from __future__ import unicode_literals

from spacy.lexeme cimport Lexeme
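
# NOTE (assumption): the types LexID and StringHash, the orth_flags bit
# positions (IS_ALPHA, IS_DIGIT, IS_PUNCT, IS_SPACE, IS_LOWER, IS_UPPER,
# IS_TITLE, IS_ASCII) and the string_views indices (NORM, SHAPE, LAST3) used
# below are presumably declared in this module's matching .pxd file; they are
# not defined here.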


def get_normalized(unicode word):
    """Normalize a word: if it is entirely alphabetic and lower-case, it is
    returned unchanged; otherwise its orthographic shape is returned instead
    (see get_word_shape).

    Args:
        word (unicode)

    Returns:
        normalized (unicode)
    """
    if word.isalpha() and word.islower():
        return word
    else:
        return get_word_shape(word)
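
# A quick sketch of the intended behaviour. The expected outputs below are
# inferred from the code above, not taken from test output:
#
#     >>> get_normalized(u'hello')
#     u'hello'
#     >>> get_normalized(u'Hello')   # not lower-case, so the shape is used
#     u'Xxxx'
#     >>> get_normalized(u'2014')
#     u'ddd'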


def get_word_shape(unicode word):
    """Map a word to a coarse orthographic "shape" string: upper-case letters
    become "X", lower-case letters become "x", digits become "d", and all
    other characters are kept as-is. Runs of identical shape characters are
    truncated to three.

    Args:
        word (unicode)

    Returns:
        shape (unicode)
    """
    cdef size_t length = len(word)
    shape = ""
    last = ""
    shape_char = ""
    seq = 0
    for c in word:
        if c.isalpha():
            if c.isupper():
                shape_char = "X"
            else:
                shape_char = "x"
        elif c.isdigit():
            shape_char = "d"
        else:
            shape_char = c
        if shape_char == last:
            seq += 1
        else:
            seq = 0
            last = shape_char
        if seq < 3:
            shape += shape_char
    assert shape
    return shape
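
# Illustrative doctest-style examples, again inferred from the code above
# (note how the run of five lower-case letters in u'Hello' is capped at
# three 'x' characters):
#
#     >>> get_word_shape(u'Hello')
#     u'Xxxx'
#     >>> get_word_shape(u'C3PO')
#     u'XdXX'
#     >>> get_word_shape(u'10,000')
#     u'dd,ddd'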


cpdef unicode get_last3(unicode string):
    return string[-3:]
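
# The is_* predicates below all read one bit from the Lexeme's orth_flags
# bit-field. The pattern, sketched in plain Python with hypothetical bit
# positions:
#
#     >>> IS_ALPHA, IS_LOWER = 0, 4          # hypothetical values
#     >>> flags = (1 << IS_ALPHA) | (1 << IS_LOWER)
#     >>> bool(flags & (1 << IS_ALPHA))
#     True
#     >>> bool(flags & (1 << 1))             # an unset bit
#     False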


cpdef bint is_alpha(LexID lex_id) except *:
    """Check whether all characters in the word's string are alphabetic.

    Should match the :py:func:`unicode.isalpha()` function.

    >>> is_alpha(lookup(u'Hello'))
    True
    >>> is_alpha(lookup(u'العرب'))
    True
    >>> is_alpha(lookup(u'10'))
    False
    """
    return (<Lexeme*>lex_id).orth_flags & 1 << IS_ALPHA


cpdef bint is_digit(LexID lex_id) except *:
    """Check whether all characters in the word's string are numeric.

    Should match the :py:func:`unicode.isdigit()` function.

    >>> is_digit(lookup(u'10'))
    True
    >>> is_digit(lookup(u'๐'))
    True
    >>> is_digit(lookup(u'one'))
    False
    """
    return (<Lexeme*>lex_id).orth_flags & 1 << IS_DIGIT


cpdef bint is_punct(LexID lex_id) except *:
    """Check whether all characters in the word's string belong to a
    punctuation Unicode category.

    >>> is_punct(lookup(u'.'))
    True
    >>> is_punct(lookup(u'⁒'))
    True
    >>> is_punct(lookup(u' '))
    False
    """
    return (<Lexeme*>lex_id).orth_flags & 1 << IS_PUNCT


cpdef bint is_space(LexID lex_id) except *:
    """Give the result of unicode.isspace() for a Lexeme ID.

    >>> is_space(lookup(u'\\t'))
    True
    >>> is_space(lookup(u'<unicode space>'))
    True
    >>> is_space(lookup(u'Hi\\n'))
    False
    """
    return (<Lexeme*>lex_id).orth_flags & 1 << IS_SPACE


cpdef bint is_lower(LexID lex_id) except *:
    """Give the result of unicode.islower() for a Lexeme ID.

    >>> is_lower(lookup(u'hi'))
    True
    >>> is_lower(lookup(<unicode>))
    True
    >>> is_lower(lookup(u'10'))
    False
    """
    return (<Lexeme*>lex_id).orth_flags & 1 << IS_LOWER


cpdef bint is_upper(LexID lex_id) except *:
    """Give the result of unicode.isupper() for a Lexeme ID.

    >>> is_upper(lookup(u'HI'))
    True
    >>> is_upper(lookup(u'H10'))
    True
    >>> is_upper(lookup(u'10'))
    False
    """
    return (<Lexeme*>lex_id).orth_flags & 1 << IS_UPPER


cpdef bint is_title(LexID lex_id) except *:
    """Give the result of unicode.istitle() for a Lexeme ID.

    >>> is_title(lookup(u'Hi'))
    True
    >>> is_title(lookup(u'Hi1'))
    True
    >>> is_title(lookup(u'1'))
    False
    """
    return (<Lexeme*>lex_id).orth_flags & 1 << IS_TITLE


cpdef bint is_ascii(LexID lex_id) except *:
    """Check whether all characters in the word's string are ASCII.

    >>> is_ascii(lookup(u'Hi'))
    True
    >>> is_ascii(lookup(u' '))
    True
    >>> is_ascii(lookup(u'<unicode>'))
    False
    """
    return (<Lexeme*>lex_id).orth_flags & 1 << IS_ASCII
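
# The functions below return precomputed hashes of alternative string views
# stored on the Lexeme (indexed by NORM, SHAPE and LAST3). The unhash()
# helper used in their doctests is assumed to recover the original string
# from its hash.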


cpdef StringHash norm_of(LexID lex_id) except 0:
    """Return the hash of a "normalized" version of the string.

    Normalized strings are intended to be less sparse, while still capturing
    important lexical information. See :py:func:`spacy.latin.orthography.normalize_string`
    for details of the normalization function.

    >>> unhash(norm_of(lookup(u'Hi')))
    u'hi'
    >>> unhash(norm_of(lookup(u'255667')))
    u'shape=dddd'
    >>> unhash(norm_of(lookup(u'...')))
    u'...'
    """
    return (<Lexeme*>lex_id).string_views[NORM]


cpdef StringHash shape_of(LexID lex_id) except 0:
    """Return the hash of a string describing the word's "orthographic shape".

    Orthographic shapes are calculated by the :py:func:`spacy.orthography.latin.string_shape`
    function. Word shape features have been found useful for NER and POS
    tagging, e.g. Manning (2011).

    >>> unhash(shape_of(lookup(u'Hi')))
    u'Xx'
    >>> unhash(shape_of(lookup(u'255667')))
    u'dddd'
    >>> unhash(shape_of(lookup(u'...')))
    u'...'
    """
    cdef Lexeme* w = <Lexeme*>lex_id
    return w.string_views[SHAPE]


cpdef StringHash last3_of(LexID lex_id) except 0:
    '''Return the hash of string[-3:], i.e. the last three characters of the word.

    >>> lex_ids = [lookup(w) for w in (u'Hello', u'!')]
    >>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
    [u'llo', u'!']
    '''
    return (<Lexeme*>lex_id).string_views[LAST3]