mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-11 12:18:04 +03:00
73 lines
2.0 KiB
Cython
from __future__ import unicode_literals
|
|
from spacy.lexeme cimport Lexeme
|
|
|
|
|
|
cpdef vector[size_t] expand_chunk(size_t addr) except *:
    """Collect the addresses of a lexeme chain, head first.

    Interprets *addr* as a ``Lexeme*`` and follows each node's ``tail``
    pointer until NULL, pushing every node's address onto the result.
    """
    cdef vector[size_t] addresses = vector[size_t]()
    cdef Lexeme* node = <Lexeme*>addr
    while node is not NULL:
        addresses.push_back(<size_t>node)
        node = node.tail
    return addresses
|
|
|
|
|
|
"""
|
|
cpdef vector[size_t] ids_from_text(unicode text) except *:
|
|
cdef size_t length = len(text)
|
|
cdef Py_UNICODE* characters = <Py_UNICODE*>text
|
|
|
|
cdef size_t i
|
|
cdef Py_UNICODE c
|
|
|
|
cdef vector[size_t] tokens = vector[size_t]()
|
|
cdef unicode current = u''
|
|
cdef Lexeme* token
|
|
cdef int alnum_end = -1
|
|
cdef size_t alnum_start = 0
|
|
cdef bint seen_alnum = False
|
|
for i in range(length):
|
|
c = characters[i]
|
|
if is_whitespace(c):
|
|
token = <Lexeme*>lookup(current)
|
|
tokens.push_back(<size_t>token)
|
|
clitic = 0
|
|
while token.clitics[clitic]:
|
|
tokens.push_back(token.clitics[clitic])
|
|
clitic += 1
|
|
current = u''
|
|
alnum_start = 0
|
|
alnum_end = -1
|
|
seen_alnum = False
|
|
else:
|
|
if not seen_alnum and c.isalnum():
|
|
alnum_start = i
|
|
seen_alnum = True
|
|
elif seen_alnum and alnum_end == -1 and not c.isalnum():
|
|
alnum_end = i
|
|
current += c
|
|
if current:
|
|
token = <Lexeme*>lookup(current)
|
|
tokens.push_back(<size_t>token)
|
|
clitic = 0
|
|
while token.clitics[clitic]:
|
|
tokens.push_back(token.clitics[clitic])
|
|
clitic += 1
|
|
return tokens
|
|
"""
|
|
|
|
#cdef vector[Tokens] group_by(Tokens tokens, LexAttr field) except *:
|
|
# pass
|
|
|
|
|
|
cdef inline bint is_whitespace(Py_UNICODE c):
    """Return True for the three whitespace characters this module splits on."""
    # TODO: Support other unicode spaces
    # https://www.cs.tut.fi/~jkorpela/chars/spaces.html
    return c == u' ' or c == u'\n' or c == u'\t'
|