Mirror of https://github.com/explosion/spaCy.git
(synced 2024-11-10 19:57:17 +03:00)

* Reorganized, moving language-independent code into spacy.spacy. The functions there take the dictionaries and the split function as arguments, while the language-specific modules are curried versions that bind their own module-level globals.
This commit is contained in:
parent a62c38e1ef
commit d5bef02c72
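
The commit message describes the new layering: spacy.spacy takes the vocabulary, reverse index and split function as explicit arguments, and spacy.en simply binds its own module-level globals and forwards to it. A minimal plain-Python sketch of that pattern follows; the names and data structures are illustrative stand-ins for the Cython ones, not the actual implementation.

```python
# Plain-Python sketch of the layering described in the commit message.
# The generic layer receives every dependency as an argument; the language
# module fixes those arguments to its own module-level globals ("currying").
# All names and data structures here are illustrative, not the real Cython ones.

def generic_lookup(vocab, bacov, find_split, string):
    """Language-independent lookup: vocab, reverse index and splitter are passed in."""
    if string not in vocab:
        split = find_split(string, len(string))
        vocab[string] = (string[:split], string[split:])
        bacov[hash(string)] = string
    return vocab[string]

def generic_tokenize(vocab, bacov, find_split, string):
    return [generic_lookup(vocab, bacov, find_split, chunk)
            for chunk in string.split()]

# "English" module: globals plus thin curried wrappers over the generic layer.
VOCAB = {}
BACOV = {}

def en_find_split(word, length):
    return length - 2 if word.endswith("'s") else length

def en_tokenize(string):
    return generic_tokenize(VOCAB, BACOV, en_find_split, string)

print(en_tokenize(u"she's here"))   # [('she', "'s"), ('here', '')]
```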

data/en/case (new file, 146129 lines)
File diff suppressed because it is too large.

data/en/clusters (new file, 316709 lines)
File diff suppressed because it is too large.

data/en/tokenization (new file, 93 lines)

@@ -0,0 +1,93 @@
# https://en.wikipedia.org/wiki/Wikipedia:List_of_English_contractions
# 21:09, 25 June 2014
#*--* --
#*---* ---
#*'s 's

ain't are not
aren't are not
can't can not
could've could have
couldn't could not
couldn't've could not have
didn't did not
doesn't does not
don't do not
hadn't had not
hadn't've had not have
hasn't has not
haven't have not
he'd he would
he'd've he would have
he'll he will
he's he 's
how'd he would
how'll he will
how's how 's
I'd I would
I'd've I would have
I'll I will
I'm I am
I've I have
isn't is not
it'd it would
it'd've it would have
it'll it will
it's it 's
let's let 's
mightn't might not
mightn't've might not have
might've might have
mustn't must not
must've must have
needn't need not
not've not have
shan't shall not
she'd she would
she'd've she would have
she'll she will
she's she 's
should've should have
shouldn't should not
shouldn't've should not have
that's that 's
there'd there would
there'd've there would have
there's there is
they'd there would
they'd've they would have
they'll they will
they're they are
they've they have
wasn't was not
we'd we would
we'd've we would have
we'll we will
we're we are
we've we have
weren't were not
what'll what will
what're what are
what's what 's
what've what have
when's when 's
where'd where would
where's where 's
where've where have
who'd who would
who'll who will
who're who are
who's who 's
who've who have
why'll who will
why're why are
why's why is
won't will not
would've would have
wouldn't would not
wouldn't've would not have
you'd you would
you'd've you would have
you'll you will
you're you are
you've you have
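
The file above is the new tokenization rule table: a `#` starts a comment, and each remaining line is a chunk followed by the whitespace-separated tokens it should be split into. A small sketch of a reader for this format follows, as an assumption about what util.read_tokenization('en') does; the real helper is not part of this diff.

```python
# Sketch of a reader for the data/en/tokenization format shown above.
# This is an assumption about what util.read_tokenization does; the real
# helper is not included in this commit.
import os

def read_tokenization(lang, data_dir='data'):
    rules = []
    path = os.path.join(data_dir, lang, 'tokenization')
    with open(path, encoding='utf8') as file_:
        for line in file_:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            pieces = line.split()
            chunk, first = pieces[0], pieces[1]
            rest = pieces[2:]
            # load_tokenization consumes (chunk, lex, tokens) triples.
            rules.append((chunk, first, rest))
    return rules

# e.g. the line "can't can not" would yield ("can't", "can", ["not"])
```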

spacy/en.cpp (4001 lines changed)
File diff suppressed because it is too large.

spacy/en.pxd (18 lines changed)

@@ -1,17 +1,15 @@
-from ext.sparsehash cimport dense_hash_map
+from libcpp.vector cimport vector
-from spacy.lexeme cimport StringHash
+from spacy.spacy cimport StringHash
+from spacy.spacy cimport Vocab
 from spacy.lexeme cimport Lexeme
+from spacy.lexeme cimport Lexeme_addr

-ctypedef Py_UNICODE* string_ptr
+cdef Vocab VOCAB
-ctypedef size_t Lexeme_addr # For python interop
+cdef dict BACOV
-ctypedef Lexeme* Lexeme_ptr

-cdef dense_hash_map[StringHash, Lexeme_ptr] LEXEMES

 cpdef Lexeme_addr lookup(unicode word) except 0
-cpdef Lexeme_addr lookup_chunk(unicode chunk, int start, int end) except 0
+cpdef vector[Lexeme_addr] tokenize(unicode string) except *
-cdef StringHash hash_string(unicode s, size_t length) except 0
 cpdef unicode unhash(StringHash hash_value)
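
The declarations above keep the English module's public surface small: lookup, tokenize and unhash, backed by the module-level VOCAB and BACOV. Assuming the extension compiles, usage would look roughly like the following; the example text and variable names are illustrative only.

```python
# Hypothetical usage of the spacy.en API declared above. Assumes the Cython
# extension has been built; results and example text are illustrative.
from spacy import en

addrs = en.tokenize(u"she's happy")   # vector of Lexeme_addr values (ints in Python)
addr = en.lookup(u"happy")            # address of a single Lexeme
```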

spacy/en.pyx (204 lines changed)

@@ -9,211 +9,43 @@ from libc.stdint cimport uint64_t
 from libcpp.vector cimport vector

 from spacy.lexeme cimport Lexeme
-from ext.murmurhash cimport MurmurHash64A
+from spacy.string_tools cimport substr
-from ext.murmurhash cimport MurmurHash64B
 from . import util

+cimport spacy

-STRINGS = {}
+BACOV = {}
-LEXEMES = dense_hash_map[StringHash, Lexeme_ptr]()
+VOCAB = Vocab()
-LEXEMES.set_empty_key(0)
+VOCAB.set_empty_key(0)


-cdef Lexeme BLANK_WORD = Lexeme(0, 0, 0, 0, 0, 0.0, 0, False, False, NULL)
+spacy.load_tokenization(VOCAB, BACOV, util.read_tokenization('en'))


-def load_tokenization(token_rules):
-    cdef Lexeme* word
-    cdef StringHash hashed
-    for chunk, lex, tokens in token_rules:
-        hashed = hash_string(chunk, len(chunk))
-        assert LEXEMES[hashed] == NULL
-        word = _add(hashed, lex, len(lex), len(lex))
-        for i, lex in enumerate(tokens):
-            token_string = '%s:@:%d:@:%s' % (chunk, i, lex)
-            length = len(token_string)
-            hashed = hash_string(token_string, length)
-            word.tail = _add(hashed, lex, 0, len(lex))
-            word = word.tail
-
-
-load_tokenization(util.read_tokenization('en'))
-
-
 cpdef vector[Lexeme_addr] tokenize(unicode string) except *:
-    cdef size_t length = len(string)
+    return spacy.tokenize(VOCAB, BACOV, find_split, string)
-    cdef Py_UNICODE* characters = <Py_UNICODE*>string
-
-    cdef size_t i
-    cdef Py_UNICODE c
-
-    cdef vector[Lexeme_addr] tokens = vector[Lexeme_addr]()
-    cdef unicode current = u''
-    cdef Lexeme* token
-    for i in range(length):
-        c = characters[i]
-        if is_whitespace(c):
-            if current:
-                token = <Lexeme*>lookup(current)
-                while token != NULL:
-                    tokens.push_back(<Lexeme_addr>token)
-                    token = token.tail
-                current = u''
-        else:
-            current += c
-    if current:
-        token = <Lexeme*>lookup(current)
-        while token != NULL:
-            tokens.push_back(<Lexeme_addr>token)
-            token = token.tail
-    return tokens
-
-
-cdef inline bint is_whitespace(Py_UNICODE c):
-    # TODO: Support other unicode spaces
-    # https://www.cs.tut.fi/~jkorpela/chars/spaces.html
-    if c == u' ':
-        return True
-    elif c == u'\n':
-        return True
-    elif c == u'\t':
-        return True
-    else:
-        return False
-
-
 cpdef Lexeme_addr lookup(unicode string) except 0:
-    '''.. function:: enumerate(sequence[, start=0])
+    return spacy.lookup(VOCAB, BACOV, find_split, -1, string)
-    Fetch a Lexeme representing a word string. If the word has not been seen,
-    construct one, splitting off any attached punctuation or clitics. A
-    reference to BLANK_WORD is returned for the empty string.
-
-    To specify the boundaries of the word if it has not been seen, use lookup_chunk.
-    '''
-    if string == '':
-        return <Lexeme_addr>&BLANK_WORD
-    cdef size_t length = len(string)
-    cdef StringHash hashed = hash_string(string, length)
-    cdef Lexeme* word_ptr = LEXEMES[hashed]
-    cdef size_t n
-    if word_ptr == NULL:
-        word_ptr = _add(hashed, string, _find_split(string, length), length)
-    return <Lexeme_addr>word_ptr
-
-
-cpdef Lexeme_addr lookup_chunk(unicode string, int start, int end) except 0:
-    '''Fetch a Lexeme representing a word string. If the word has not been seen,
-    construct one, given the specified start and end indices. A negative index
-    significes 0 for start, and the string length for end --- i.e. the string
-    will not be sliced if start == -1 and end == -1.
-
-    A reference to BLANK_WORD is returned for the empty string.
-    '''
-    if string == '':
-        return <Lexeme_addr>&BLANK_WORD
-    cdef size_t length = len(string)
-    cdef StringHash hashed = hash_string(string, length)
-    cdef Lexeme* chunk_ptr = LEXEMES[hashed]
-    if chunk_ptr == NULL:
-        chunk_ptr = _add(hashed, string, start, length)
-    return <Lexeme_addr>chunk_ptr
-
-
-cdef StringHash hash_string(unicode s, size_t length) except 0:
-    '''Hash unicode with MurmurHash64A'''
-    assert length
-    return MurmurHash64A(<string_ptr>s, length * sizeof(Py_UNICODE), 0)
-
-
 cpdef unicode unhash(StringHash hash_value):
-    '''Fetch a string from the reverse index, given its hash value.'''
+    return spacy.unhash(BACOV, hash_value)
-    cdef string_ptr string = STRINGS[hash_value]
-    if string == NULL:
-        raise ValueError(hash_value)
-    return string
-
-
-cdef unicode normalize_word_string(unicode word):
+cdef vector[StringHash] make_string_views(unicode word):
-    '''Return a normalized version of the word, mapping:
-    - 4 digit strings into !YEAR
-    - Other digit strings into !DIGITS
-    - All other strings into lower-case
-    '''
     cdef unicode s
-    if word.isdigit() and len(word) == 4:
+    return vector[StringHash]()
-        return '!YEAR'
+    #if word.isdigit() and len(word) == 4:
-    elif word[0].isdigit():
+    #    return '!YEAR'
-        return '!DIGITS'
+    #elif word[0].isdigit():
-    else:
+    #    return '!DIGITS'
-        return word.lower()
+    #else:
+    #    return word.lower()


-cpdef unicode _substr(unicode string, int start, int end, size_t length):
+cdef int find_split(unicode word, size_t length):
-    if end >= length:
-        end = -1
-    if start >= length:
-        start = 0
-    if start <= 0 and end < 0:
-        return string
-    elif start < 0:
-        start = 0
-    elif end < 0:
-        end = length
-    return string[start:end]
-
-
-cdef Lexeme* _add(StringHash hashed, unicode string, int split, size_t length) except NULL:
-    assert string
-    assert split <= length
-    word = _init_lexeme(string, hashed, split, length)
-    LEXEMES[hashed] = word
-    STRINGS[hashed] = string
-    return word
-
-
-cdef Lexeme* _init_lexeme(unicode string, StringHash hashed,
-                          int split, size_t length) except NULL:
-    assert split <= length
-    cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
-
-    word.first = <Py_UNICODE>(string[0] if string else 0)
-    word.sic = hashed
-
-    cdef unicode tail_string
-    cdef unicode lex
-    if split != 0 and split < length:
-        lex = _substr(string, 0, split, length)
-        tail_string = _substr(string, split, length, length)
-    else:
-        lex = string
-        tail_string = ''
-    assert lex
-    cdef unicode normed = normalize_word_string(lex)
-    cdef unicode last3 = _substr(string, length - 3, length, length)
-
-    assert normed
-    assert len(normed)
-
-    word.lex = hash_string(lex, len(lex))
-    word.normed = hash_string(normed, len(normed))
-    word.last3 = hash_string(last3, len(last3))
-
-    STRINGS[word.lex] = lex
-    STRINGS[word.normed] = normed
-    STRINGS[word.last3] = last3
-
-    # These are loaded later
-    word.prob = 0
-    word.cluster = 0
-    word.oft_upper = False
-    word.oft_title = False
-
-    # Now recurse, and deal with the tail
-    if tail_string:
-        word.tail = <Lexeme*>lookup(tail_string)
-    return word
-
-
-cdef size_t _find_split(unicode word, size_t length):
     cdef int i = 0
     # Contractions
     if word.endswith("'s"):
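
The hunk is cut off inside the new find_split; from the old _find_split it replaces, its job is to return the index at which a chunk should be divided, with the full length meaning "no split". A plain-Python sketch of such a splitter follows; only the "'s" case is visible in the diff, and the trailing-punctuation branch is an assumed extension, not the actual implementation.

```python
# Plain-Python sketch of a find_split-style function. Only the "'s" case is
# visible in the diff above; the punctuation branch is an assumed extension.
def find_split(word, length):
    # Contractions: split "she's" into "she" + "'s"
    if word.endswith("'s") and length > 2:
        return length - 2
    # Assumed: split off a single trailing punctuation mark, e.g. "happy,"
    if length > 1 and not word[-1].isalnum():
        return length - 1
    return length  # no split
```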

spacy/lexeme.cpp (915 lines changed)
File diff suppressed because it is too large.

spacy/lexeme.pxd

@@ -1,9 +1,12 @@
 from libc.stdint cimport uint64_t

+# Put these above import to avoid circular import problem
 ctypedef int ClusterID
 ctypedef uint64_t StringHash
+ctypedef size_t Lexeme_addr

+from spacy.spacy cimport Vocab
+from spacy.spacy cimport Splitter

 cdef struct Lexeme:
     StringHash sic    # Hash of the original string

@@ -20,6 +23,12 @@ cdef struct Lexeme:
     Lexeme* tail    # Lexemes are linked lists, to deal with sub-tokens


+cdef Lexeme BLANK_WORD = Lexeme(0, 0, 0, 0, 0, 0.0, 0, False, False, NULL)
+
+cdef Lexeme* init_lexeme(Vocab vocab, dict bacov, Splitter find_split,
+                         unicode string, StringHash hashed,
+                         int split, size_t length) except NULL
+
 # Use these to access the Lexeme fields via get_attr(Lexeme*, LexAttr), which
 # has a conditional to pick out the correct item. This allows safe iteration
 # over the Lexeme, via:

spacy/lexeme.pyx

@@ -2,6 +2,60 @@
 Mostly useful from Python-space. From Cython-space, you can just cast to
 Lexeme* yourself.
 '''
+from __future__ import unicode_literals
+
+from spacy.string_tools cimport substr
+from spacy.spacy cimport hash_string
+from spacy.spacy cimport lookup
+
+from libc.stdlib cimport malloc, calloc, free
+from libc.stdint cimport uint64_t
+from libcpp.vector cimport vector
+
+
+cdef Lexeme* init_lexeme(Vocab vocab, dict bacov, Splitter find_split,
+                         unicode string, StringHash hashed,
+                         int split, size_t length) except NULL:
+    assert split <= length
+    cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
+
+    word.first = <Py_UNICODE>(string[0] if string else 0)
+    word.sic = hashed
+
+    cdef unicode tail_string
+    cdef unicode lex
+    if split != 0 and split < length:
+        lex = substr(string, 0, split, length)
+        tail_string = substr(string, split, length, length)
+    else:
+        lex = string
+        tail_string = ''
+    assert lex
+    #cdef unicode normed = normalize_word_string(lex)
+    cdef unicode normed = '?'
+    cdef unicode last3 = substr(string, length - 3, length, length)
+
+    assert normed
+    assert len(normed)
+
+    word.lex = hash_string(lex, len(lex))
+    word.normed = hash_string(normed, len(normed))
+    word.last3 = hash_string(last3, len(last3))
+
+    bacov[word.lex] = lex
+    bacov[word.normed] = normed
+    bacov[word.last3] = last3
+
+    # These are loaded later
+    word.prob = 0
+    word.cluster = 0
+    word.oft_upper = False
+    word.oft_title = False
+
+    # Now recurse, and deal with the tail
+    if tail_string:
+        word.tail = <Lexeme*>lookup(vocab, bacov, find_split, -1, tail_string)
+    return word
+
+
 cpdef StringHash sic_of(size_t lex_id) except 0:
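
init_lexeme above stores three hashed views of each word (the exact form, a normalized form that the diff stubs out as '?' for now, and the last three characters) and records every hash in bacov, the reverse index that unhash reads. A plain-Python sketch of that bookkeeping follows, with a dict and Python's built-in hash() standing in for dense_hash_map and MurmurHash64A.

```python
# Plain-Python sketch of the lex / normed / last3 views and the BACOV
# reverse index maintained by init_lexeme. Python's built-in hash() stands
# in for MurmurHash64A, and a dict stands in for the Cython containers.
def string_views(word):
    lex = word
    normed = word.lower()   # the diff stubs the real normalization out as '?'
    last3 = word[-3:]
    return lex, normed, last3

def register(bacov, word):
    hashes = []
    for view in string_views(word):
        key = hash(view)
        bacov[key] = view   # lets unhash() recover the string later
        hashes.append(key)
    return hashes

BACOV = {}
lex_h, normed_h, last3_h = register(BACOV, u"Gooogle")
assert BACOV[last3_h] == u"gle"
```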

spacy/spacy.cpp (1652 lines changed)
File diff suppressed because it is too large.

spacy/spacy.pxd

@@ -1,5 +1,24 @@
 from libcpp.vector cimport vector
+from libc.stdint cimport uint64_t
+
+from ext.sparsehash cimport dense_hash_map
+
+# Circular import problems here
+ctypedef size_t Lexeme_addr
+ctypedef uint64_t StringHash
+ctypedef dense_hash_map[StringHash, Lexeme_addr] Vocab
+ctypedef int (*Splitter)(unicode word, size_t length)
+

 from spacy.lexeme cimport Lexeme

+cdef load_tokenization(Vocab& vocab, dict bacov, token_rules)
+cdef vector[Lexeme_addr] tokenize(Vocab& vocab, dict bacov, Splitter splitter,
+                                  unicode string) except *
+cdef Lexeme_addr lookup(Vocab& vocab, dict bacov, Splitter splitter, int start,
+                        unicode string) except 0
+cdef StringHash hash_string(unicode s, size_t length) except 0
+cdef unicode unhash(dict bacov, StringHash hash_value)
+
 cpdef vector[size_t] expand_chunk(size_t addr) except *

spacy/spacy.pyx

@@ -1,5 +1,78 @@
 from __future__ import unicode_literals
-from spacy.lexeme cimport Lexeme

+from ext.murmurhash cimport MurmurHash64A
+from ext.murmurhash cimport MurmurHash64B
+
+from spacy.lexeme cimport init_lexeme
+from spacy.lexeme cimport BLANK_WORD
+
+from spacy.string_tools cimport is_whitespace
+
+from . import util
+
+
+cdef load_tokenization(Vocab& vocab, dict bacov, token_rules):
+    cdef Lexeme* word
+    cdef StringHash hashed
+    for chunk, lex, tokens in token_rules:
+        hashed = hash_string(chunk, len(chunk))
+        assert vocab[hashed] == 0
+        word = _add(vocab, bacov, <Splitter>NULL, hashed, lex, len(lex), len(lex))
+        for i, lex in enumerate(tokens):
+            token_string = '%s:@:%d:@:%s' % (chunk, i, lex)
+            length = len(token_string)
+            hashed = hash_string(token_string, length)
+            word.tail = _add(vocab, bacov, <Splitter>NULL, hashed, lex, 0, len(lex))
+            word = word.tail
+
+
+cdef vector[Lexeme_addr] tokenize(Vocab& vocab, dict bacov, Splitter splitter,
+                                  unicode string) except *:
+    cdef size_t length = len(string)
+    cdef Py_UNICODE* characters = <Py_UNICODE*>string
+
+    cdef size_t i
+    cdef Py_UNICODE c
+
+    cdef vector[Lexeme_addr] tokens = vector[Lexeme_addr]()
+    cdef unicode current = u''
+    cdef Lexeme* token
+    for i in range(length):
+        c = characters[i]
+        if is_whitespace(c):
+            if current:
+                token = <Lexeme*>lookup(vocab, bacov, splitter, -1, current)
+                while token != NULL:
+                    tokens.push_back(<Lexeme_addr>token)
+                    token = token.tail
+                current = u''
+        else:
+            current += c
+    if current:
+        token = <Lexeme*>lookup(vocab, bacov, splitter, -1, current)
+        while token != NULL:
+            tokens.push_back(<Lexeme_addr>token)
+            token = token.tail
+    return tokens
+
+
+cdef Lexeme_addr lookup(Vocab& vocab, dict bacov, Splitter find_split, int start,
+                        unicode string) except 0:
+    '''Fetch a Lexeme representing a word string. If the word has not been seen,
+    construct one, splitting off any attached punctuation or clitics. A
+    reference to BLANK_WORD is returned for the empty string.
+
+    To specify the boundaries of the word if it has not been seen, use lookup_chunk.
+    '''
+    if string == '':
+        return <Lexeme_addr>&BLANK_WORD
+    cdef size_t length = len(string)
+    cdef StringHash hashed = hash_string(string, length)
+    cdef Lexeme* word_ptr = <Lexeme*>vocab[hashed]
+    if word_ptr == NULL:
+        start = find_split(string, length) if start == -1 else start
+        word_ptr = _add(vocab, bacov, find_split, hashed, string, start, length)
+    return <Lexeme_addr>word_ptr
+
+
 cpdef vector[size_t] expand_chunk(size_t addr) except *:

@@ -11,3 +84,22 @@ cpdef vector[size_t] expand_chunk(size_t addr) except *:
     return tokens
+
+
+cdef StringHash hash_string(unicode s, size_t length) except 0:
+    '''Hash unicode with MurmurHash64A'''
+    assert length
+    return MurmurHash64A(<Py_UNICODE*>s, length * sizeof(Py_UNICODE), 0)
+
+
+cdef unicode unhash(dict bacov, StringHash hash_value):
+    '''Fetch a string from the reverse index, given its hash value.'''
+    return bacov[hash_value]
+
+
+cdef Lexeme* _add(Vocab& vocab, dict bacov, Splitter find_split, StringHash hashed,
+                  unicode string, int split, size_t length) except NULL:
+    assert string
+    assert split <= length
+    word = init_lexeme(vocab, bacov, find_split, string, hashed, split, length)
+    vocab[hashed] = <Lexeme_addr>word
+    bacov[hashed] = string
+    return word
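
load_tokenization above keys each sub-token of a rule under a synthetic '%s:@:%d:@:%s' string, so repeated tokens of the same chunk get distinct vocabulary entries, and chains them through the tail pointer that tokenize later walks. A plain-Python sketch of that expansion follows, using a dict vocabulary and lists in place of the linked Lexeme structs.

```python
# Plain-Python sketch of the special-case expansion in load_tokenization /
# tokenize: each rule chunk maps to a chain of sub-tokens, keyed under
# synthetic "chunk:@:i:@:token" strings so duplicates stay distinct.
VOCAB = {}

def load_tokenization(token_rules):
    for chunk, first, rest in token_rules:
        chain = [first]
        for i, tok in enumerate(rest):
            # unique per chunk and position, so repeated tokens stay distinct
            key = '%s:@:%d:@:%s' % (chunk, i, tok)
            VOCAB[key] = [tok]
            chain.append(tok)
        VOCAB[chunk] = chain

def tokenize(string):
    tokens = []
    for chunk in string.split():
        tokens.extend(VOCAB.get(chunk, [chunk]))   # walk the whole chain, like .tail
    return tokens

load_tokenization([("can't", "can", ["not"]), ("ain't", "are", ["not"])])
assert tokenize(u"I can't go") == [u'I', u'can', u'not', u'go']
```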

spacy/string_tools.pxd (new file, 3 lines)

@@ -0,0 +1,3 @@
cpdef unicode substr(unicode string, int start, int end, size_t length)

cdef bint is_whitespace(Py_UNICODE c)

spacy/string_tools.pyx (new file, 25 lines)

@@ -0,0 +1,25 @@
cpdef unicode substr(unicode string, int start, int end, size_t length):
    if end >= length:
        end = -1
    if start >= length:
        start = 0
    if start <= 0 and end < 0:
        return string
    elif start < 0:
        start = 0
    elif end < 0:
        end = length
    return string[start:end]


cdef bint is_whitespace(Py_UNICODE c):
    # TODO: Support other unicode spaces
    # https://www.cs.tut.fi/~jkorpela/chars/spaces.html
    if c == u' ':
        return True
    elif c == u'\n':
        return True
    elif c == u'\t':
        return True
    else:
        return False
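
substr above clamps rather than raises: an end at or past length is treated as "to the end of the string", a start at or past length falls back to 0, and a negative index leaves that side open. A plain-Python mirror with a few checks of that behaviour:

```python
# Plain-Python mirror of substr() above, plus a few checks of its clamping
# behaviour. The body is copied from the Cython version in this commit.
def substr(string, start, end, length):
    if end >= length:
        end = -1
    if start >= length:
        start = 0
    if start <= 0 and end < 0:
        return string
    elif start < 0:
        start = 0
    elif end < 0:
        end = length
    return string[start:end]

assert substr(u"spaCy", 0, 5, 5) == u"spaCy"   # end past length: whole string
assert substr(u"spaCy", 2, 5, 5) == u"aCy"     # tail from the split point
assert substr(u"spaCy", 0, 3, 5) == u"spa"     # head up to the split point
```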