spaCy/spacy/string_tools.pyx
2014-08-16 03:22:03 +02:00

38 lines
806 B
Cython

# cython: profile=True
from murmurhash cimport mrmr
cpdef bytes to_bytes(unicode string):
    """Encode a unicode string to its UTF-8 byte representation.

    Args:
        string: The text to encode.

    Returns:
        The UTF-8 encoded bytes of ``string``.
    """
    cdef bytes encoded = string.encode('utf8')
    return encoded
cpdef unicode from_bytes(bytes string):
    """Decode UTF-8 encoded bytes into a unicode string.

    Args:
        string: UTF-8 encoded bytes.

    Returns:
        The decoded unicode text.

    Raises:
        UnicodeDecodeError: If ``string`` is not valid UTF-8.
    """
    cdef unicode decoded = string.decode('utf8')
    return decoded
cpdef unicode substr(unicode string, int start, int end, size_t length):
    """Return ``string[start:end]`` with out-of-range bounds normalised.

    A ``start`` that is negative or not below ``length`` is treated as 0.
    An ``end`` that is negative or not below ``length`` means "up to
    ``length``". When both bounds normalise to the full range, the input
    string object itself is returned without slicing.

    Args:
        string: The text to slice.
        start: Index of the first character to keep.
        end: Index one past the last character to keep.
        length: Bound used for the range checks; presumably ``len(string)``
            -- verify against callers.

    Returns:
        The selected slice, or ``string`` itself for the full range.
    """
    # The sign is tested before the unsigned cast so that negative values
    # never reach the signed/unsigned comparison against ``length``.
    cdef bint open_ended = end < 0 or <size_t>end >= length
    cdef int first = start
    if start < 0 or <size_t>start >= length:
        first = 0

    if open_ended:
        if first == 0:
            return string
        end = length
    return string[first:end]
cdef bint is_whitespace(Py_UNICODE c):
    """Return True when ``c`` is a space, a newline or a tab.

    Only these three characters are recognised; every other code point,
    including other Unicode space characters, yields False.
    """
    # TODO: Support other unicode spaces
    # https://www.cs.tut.fi/~jkorpela/chars/spaces.html
    return c == u' ' or c == u'\n' or c == u'\t'