* Reorganized, moving language-independent stuff to spacy. The functions in spacy ask for the dictionaries and split function on input, but the language-specific modules are curried versions that use the globals

This commit is contained in:
Matthew Honnibal 2014-07-07 04:21:06 +02:00
parent a62c38e1ef
commit d5bef02c72
14 changed files with 466242 additions and 3689 deletions

146129
data/en/case Normal file

File diff suppressed because it is too large Load Diff

316709
data/en/clusters Normal file

File diff suppressed because it is too large Load Diff

93
data/en/tokenization Normal file
View File

@ -0,0 +1,93 @@
# https://en.wikipedia.org/wiki/Wikipedia:List_of_English_contractions
# 21:09, 25 June 2014
#*--* --
#*---* ---
#*'s 's
ain't are not
aren't are not
can't can not
could've could have
couldn't could not
couldn't've could not have
didn't did not
doesn't does not
don't do not
hadn't had not
hadn't've had not have
hasn't has not
haven't have not
he'd he would
he'd've he would have
he'll he will
he's he 's
how'd how would
how'll how will
how's how 's
I'd I would
I'd've I would have
I'll I will
I'm I am
I've I have
isn't is not
it'd it would
it'd've it would have
it'll it will
it's it 's
let's let 's
mightn't might not
mightn't've might not have
might've might have
mustn't must not
must've must have
needn't need not
not've not have
shan't shall not
she'd she would
she'd've she would have
she'll she will
she's she 's
should've should have
shouldn't should not
shouldn't've should not have
that's that 's
there'd there would
there'd've there would have
there's there is
they'd they would
they'd've they would have
they'll they will
they're they are
they've they have
wasn't was not
we'd we would
we'd've we would have
we'll we will
we're we are
we've we have
weren't were not
what'll what will
what're what are
what's what 's
what've what have
when's when 's
where'd where would
where's where 's
where've where have
who'd who would
who'll who will
who're who are
who's who 's
who've who have
why'll why will
why're why are
why's why is
won't will not
would've would have
wouldn't would not
wouldn't've would not have
you'd you would
you'd've you would have
you'll you will
you're you are
you've you have

File diff suppressed because it is too large Load Diff

View File

@ -1,17 +1,15 @@
from ext.sparsehash cimport dense_hash_map
from spacy.lexeme cimport StringHash
from libcpp.vector cimport vector
from spacy.spacy cimport StringHash
from spacy.spacy cimport Vocab
from spacy.lexeme cimport Lexeme
from spacy.lexeme cimport Lexeme_addr
ctypedef Py_UNICODE* string_ptr
ctypedef size_t Lexeme_addr # For python interop
ctypedef Lexeme* Lexeme_ptr
cdef dense_hash_map[StringHash, Lexeme_ptr] LEXEMES
cdef Vocab VOCAB
cdef dict BACOV
cpdef Lexeme_addr lookup(unicode word) except 0
cpdef Lexeme_addr lookup_chunk(unicode chunk, int start, int end) except 0
cdef StringHash hash_string(unicode s, size_t length) except 0
cpdef vector[Lexeme_addr] tokenize(unicode string) except *
cpdef unicode unhash(StringHash hash_value)

View File

@ -9,211 +9,43 @@ from libc.stdint cimport uint64_t
from libcpp.vector cimport vector
from spacy.lexeme cimport Lexeme
from ext.murmurhash cimport MurmurHash64A
from ext.murmurhash cimport MurmurHash64B
from spacy.string_tools cimport substr
from . import util
cimport spacy
STRINGS = {}
LEXEMES = dense_hash_map[StringHash, Lexeme_ptr]()
LEXEMES.set_empty_key(0)
BACOV = {}
VOCAB = Vocab()
VOCAB.set_empty_key(0)
cdef Lexeme BLANK_WORD = Lexeme(0, 0, 0, 0, 0, 0.0, 0, False, False, NULL)
def load_tokenization(token_rules):
cdef Lexeme* word
cdef StringHash hashed
for chunk, lex, tokens in token_rules:
hashed = hash_string(chunk, len(chunk))
assert LEXEMES[hashed] == NULL
word = _add(hashed, lex, len(lex), len(lex))
for i, lex in enumerate(tokens):
token_string = '%s:@:%d:@:%s' % (chunk, i, lex)
length = len(token_string)
hashed = hash_string(token_string, length)
word.tail = _add(hashed, lex, 0, len(lex))
word = word.tail
load_tokenization(util.read_tokenization('en'))
spacy.load_tokenization(VOCAB, BACOV, util.read_tokenization('en'))
cpdef vector[Lexeme_addr] tokenize(unicode string) except *:
cdef size_t length = len(string)
cdef Py_UNICODE* characters = <Py_UNICODE*>string
return spacy.tokenize(VOCAB, BACOV, find_split, string)
cdef size_t i
cdef Py_UNICODE c
cdef vector[Lexeme_addr] tokens = vector[Lexeme_addr]()
cdef unicode current = u''
cdef Lexeme* token
for i in range(length):
c = characters[i]
if is_whitespace(c):
if current:
token = <Lexeme*>lookup(current)
while token != NULL:
tokens.push_back(<Lexeme_addr>token)
token = token.tail
current = u''
else:
current += c
if current:
token = <Lexeme*>lookup(current)
while token != NULL:
tokens.push_back(<Lexeme_addr>token)
token = token.tail
return tokens
cdef inline bint is_whitespace(Py_UNICODE c):
# TODO: Support other unicode spaces
# https://www.cs.tut.fi/~jkorpela/chars/spaces.html
if c == u' ':
return True
elif c == u'\n':
return True
elif c == u'\t':
return True
else:
return False
cpdef Lexeme_addr lookup(unicode string) except 0:
'''.. function:: enumerate(sequence[, start=0])
Fetch a Lexeme representing a word string. If the word has not been seen,
construct one, splitting off any attached punctuation or clitics. A
reference to BLANK_WORD is returned for the empty string.
To specify the boundaries of the word if it has not been seen, use lookup_chunk.
'''
if string == '':
return <Lexeme_addr>&BLANK_WORD
cdef size_t length = len(string)
cdef StringHash hashed = hash_string(string, length)
cdef Lexeme* word_ptr = LEXEMES[hashed]
cdef size_t n
if word_ptr == NULL:
word_ptr = _add(hashed, string, _find_split(string, length), length)
return <Lexeme_addr>word_ptr
cpdef Lexeme_addr lookup_chunk(unicode string, int start, int end) except 0:
'''Fetch a Lexeme representing a word string. If the word has not been seen,
construct one, given the specified start and end indices. A negative index
significes 0 for start, and the string length for end --- i.e. the string
will not be sliced if start == -1 and end == -1.
A reference to BLANK_WORD is returned for the empty string.
'''
if string == '':
return <Lexeme_addr>&BLANK_WORD
cdef size_t length = len(string)
cdef StringHash hashed = hash_string(string, length)
cdef Lexeme* chunk_ptr = LEXEMES[hashed]
if chunk_ptr == NULL:
chunk_ptr = _add(hashed, string, start, length)
return <Lexeme_addr>chunk_ptr
cdef StringHash hash_string(unicode s, size_t length) except 0:
'''Hash unicode with MurmurHash64A'''
assert length
return MurmurHash64A(<string_ptr>s, length * sizeof(Py_UNICODE), 0)
return spacy.lookup(VOCAB, BACOV, find_split, -1, string)
cpdef unicode unhash(StringHash hash_value):
'''Fetch a string from the reverse index, given its hash value.'''
cdef string_ptr string = STRINGS[hash_value]
if string == NULL:
raise ValueError(hash_value)
return string
return spacy.unhash(BACOV, hash_value)
cdef unicode normalize_word_string(unicode word):
'''Return a normalized version of the word, mapping:
- 4 digit strings into !YEAR
- Other digit strings into !DIGITS
- All other strings into lower-case
'''
cdef vector[StringHash] make_string_views(unicode word):
cdef unicode s
if word.isdigit() and len(word) == 4:
return '!YEAR'
elif word[0].isdigit():
return '!DIGITS'
else:
return word.lower()
return vector[StringHash]()
#if word.isdigit() and len(word) == 4:
# return '!YEAR'
#elif word[0].isdigit():
# return '!DIGITS'
#else:
# return word.lower()
cpdef unicode _substr(unicode string, int start, int end, size_t length):
if end >= length:
end = -1
if start >= length:
start = 0
if start <= 0 and end < 0:
return string
elif start < 0:
start = 0
elif end < 0:
end = length
return string[start:end]
cdef Lexeme* _add(StringHash hashed, unicode string, int split, size_t length) except NULL:
assert string
assert split <= length
word = _init_lexeme(string, hashed, split, length)
LEXEMES[hashed] = word
STRINGS[hashed] = string
return word
cdef Lexeme* _init_lexeme(unicode string, StringHash hashed,
int split, size_t length) except NULL:
assert split <= length
cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
word.first = <Py_UNICODE>(string[0] if string else 0)
word.sic = hashed
cdef unicode tail_string
cdef unicode lex
if split != 0 and split < length:
lex = _substr(string, 0, split, length)
tail_string = _substr(string, split, length, length)
else:
lex = string
tail_string = ''
assert lex
cdef unicode normed = normalize_word_string(lex)
cdef unicode last3 = _substr(string, length - 3, length, length)
assert normed
assert len(normed)
word.lex = hash_string(lex, len(lex))
word.normed = hash_string(normed, len(normed))
word.last3 = hash_string(last3, len(last3))
STRINGS[word.lex] = lex
STRINGS[word.normed] = normed
STRINGS[word.last3] = last3
# These are loaded later
word.prob = 0
word.cluster = 0
word.oft_upper = False
word.oft_title = False
# Now recurse, and deal with the tail
if tail_string:
word.tail = <Lexeme*>lookup(tail_string)
return word
cdef size_t _find_split(unicode word, size_t length):
cdef int find_split(unicode word, size_t length):
cdef int i = 0
# Contractions
if word.endswith("'s"):

File diff suppressed because it is too large Load Diff

View File

@ -1,9 +1,12 @@
from libc.stdint cimport uint64_t
# Put these above import to avoid circular import problem
ctypedef int ClusterID
ctypedef uint64_t StringHash
ctypedef size_t Lexeme_addr
from spacy.spacy cimport Vocab
from spacy.spacy cimport Splitter
cdef struct Lexeme:
StringHash sic # Hash of the original string
@ -20,6 +23,12 @@ cdef struct Lexeme:
Lexeme* tail # Lexemes are linked lists, to deal with sub-tokens
cdef Lexeme BLANK_WORD = Lexeme(0, 0, 0, 0, 0, 0.0, 0, False, False, NULL)
cdef Lexeme* init_lexeme(Vocab vocab, dict bacov, Splitter find_split,
unicode string, StringHash hashed,
int split, size_t length) except NULL
# Use these to access the Lexeme fields via get_attr(Lexeme*, LexAttr), which
# has a conditional to pick out the correct item. This allows safe iteration
# over the Lexeme, via:

View File

@ -2,6 +2,60 @@
Mostly useful from Python-space. From Cython-space, you can just cast to
Lexeme* yourself.
'''
from __future__ import unicode_literals
from spacy.string_tools cimport substr
from spacy.spacy cimport hash_string
from spacy.spacy cimport lookup
from libc.stdlib cimport malloc, calloc, free
from libc.stdint cimport uint64_t
from libcpp.vector cimport vector
cdef Lexeme* init_lexeme(Vocab vocab, dict bacov, Splitter find_split,
                         unicode string, StringHash hashed,
                         int split, size_t length) except NULL:
    '''Allocate a Lexeme for string and fill in its fields.

    The string is cut at index split: the head becomes the lexeme's lex, and
    any remainder is resolved through lookup() and attached as word.tail. A
    split of 0, or one that is not below length, leaves the string whole.

    Args:
        vocab: Hash-to-lexeme table, forwarded to lookup() for the tail.
        bacov: Reverse index; receives hash -> string entries for lex,
            normed and last3.
        find_split: Splitter forwarded to lookup() for the tail.
        string: The text the lexeme represents.
        hashed: Hash of string, stored as word.sic.
        split: Index at which to cut string; must not exceed length.
        length: Length of string.

    Returns:
        Pointer to a zero-initialised, calloc'd Lexeme. This function does
        not free it on success.

    Raises:
        MemoryError: If the allocation fails.
    '''
    assert split <= length
    # Declared up front: cdef statements are not allowed inside the try block.
    cdef unicode lex
    cdef unicode tail_string
    cdef unicode normed
    cdef unicode last3
    cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
    if word == NULL:
        # Without this check the field writes below dereference NULL.
        raise MemoryError()
    try:
        word.first = <Py_UNICODE>(string[0] if string else 0)
        word.sic = hashed

        if split != 0 and split < length:
            lex = substr(string, 0, split, length)
            tail_string = substr(string, split, length, length)
        else:
            lex = string
            tail_string = ''
        assert lex
        # Normalisation is disabled for now; every lexeme gets the same
        # placeholder normed string.
        #normed = normalize_word_string(lex)
        normed = '?'
        last3 = substr(string, length - 3, length, length)
        assert normed

        word.lex = hash_string(lex, len(lex))
        word.normed = hash_string(normed, len(normed))
        word.last3 = hash_string(last3, len(last3))

        bacov[word.lex] = lex
        bacov[word.normed] = normed
        bacov[word.last3] = last3

        # These are loaded later
        word.prob = 0
        word.cluster = 0
        word.oft_upper = False
        word.oft_title = False

        # Now recurse, and deal with the tail
        # NOTE(review): vocab is received by value here, while the functions
        # declared in spacy.pxd take Vocab&. If that is a real copy, lexemes
        # created for the tail are registered in the copy only. Confirm, and
        # change this signature together with the one in lexeme.pxd.
        if tail_string:
            word.tail = <Lexeme*>lookup(vocab, bacov, find_split, -1, tail_string)
    except BaseException:
        # The lexeme has not been handed to anyone yet, so release it rather
        # than leak it, then let the error propagate.
        free(word)
        raise
    return word
cpdef StringHash sic_of(size_t lex_id) except 0:

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,24 @@
from libcpp.vector cimport vector
from libc.stdint cimport uint64_t
from ext.sparsehash cimport dense_hash_map
# Circular import problems here
ctypedef size_t Lexeme_addr
ctypedef uint64_t StringHash
ctypedef dense_hash_map[StringHash, Lexeme_addr] Vocab
ctypedef int (*Splitter)(unicode word, size_t length)
from spacy.lexeme cimport Lexeme
# Language-independent entry points. Each takes the vocabulary table and the
# reverse (hash -> string) dict as arguments rather than using module globals.

# Register special-case tokenization rules in vocab and bacov.
cdef load_tokenization(Vocab& vocab, dict bacov, token_rules)
# Split a string on whitespace and return the lexeme addresses of its tokens.
cdef vector[Lexeme_addr] tokenize(Vocab& vocab, dict bacov, Splitter splitter,
unicode string) except *
# Fetch (or create) the lexeme for a string; start == -1 asks the splitter for
# the split point.
cdef Lexeme_addr lookup(Vocab& vocab, dict bacov, Splitter splitter, int start,
unicode string) except 0
# Hash `length` Py_UNICODE units of s with MurmurHash64A.
cdef StringHash hash_string(unicode s, size_t length) except 0
# Return the string stored in bacov for hash_value.
cdef unicode unhash(dict bacov, StringHash hash_value)
# NOTE(review): body not visible in this hunk; presumably walks a lexeme's
# tail chain starting at addr -- confirm against spacy.pyx.
cpdef vector[size_t] expand_chunk(size_t addr) except *

View File

@ -1,5 +1,78 @@
from __future__ import unicode_literals
from spacy.lexeme cimport Lexeme
from ext.murmurhash cimport MurmurHash64A
from ext.murmurhash cimport MurmurHash64B
from spacy.lexeme cimport init_lexeme
from spacy.lexeme cimport BLANK_WORD
from spacy.string_tools cimport is_whitespace
from . import util
cdef load_tokenization(Vocab& vocab, dict bacov, token_rules):
    '''Register special-case tokenization rules in vocab and bacov.

    Each rule unpacks as (chunk, lex, tokens). The head lexeme is stored under
    the hash of chunk with lex as its string. Every entry of tokens is then
    stored under a synthetic key built from the chunk, its position and its
    text, and linked on as the next tail of the chain.

    A chunk that is already present in vocab trips the assertion.
    '''
    cdef Lexeme* node
    cdef StringHash key
    for chunk, lex, tokens in token_rules:
        key = hash_string(chunk, len(chunk))
        assert vocab[key] == 0
        # split == length, so init_lexeme computes no tail for the head.
        node = _add(vocab, bacov, <Splitter>NULL, key, lex, len(lex), len(lex))
        for position, sub_token in enumerate(tokens):
            # The synthetic key keeps sub-tokens of different chunks apart.
            chain_key = '%s:@:%d:@:%s' % (chunk, position, sub_token)
            key = hash_string(chain_key, len(chain_key))
            node.tail = _add(vocab, bacov, <Splitter>NULL, key, sub_token, 0,
                             len(sub_token))
            node = node.tail
cdef vector[Lexeme_addr] tokenize(Vocab& vocab, dict bacov, Splitter splitter,
                                  unicode string) except *:
    '''Split string on whitespace and return the addresses of its lexemes.

    Characters are accumulated until is_whitespace() reports a separator. Each
    accumulated chunk is resolved with lookup(), and every lexeme on its tail
    chain is appended to the result in order.
    '''
    cdef vector[Lexeme_addr] tokens = vector[Lexeme_addr]()
    cdef Py_UNICODE* characters = <Py_UNICODE*>string
    cdef size_t length = len(string)
    cdef unicode current = u''
    cdef Lexeme* token
    cdef Py_UNICODE c
    cdef size_t i
    for i in range(length):
        c = characters[i]
        if not is_whitespace(c):
            current += c
            continue
        # Whitespace ends the pending chunk, if there is one.
        if current:
            token = <Lexeme*>lookup(vocab, bacov, splitter, -1, current)
            while token != NULL:
                tokens.push_back(<Lexeme_addr>token)
                token = token.tail
        current = u''
    # Flush a chunk that runs up to the end of the string.
    if current:
        token = <Lexeme*>lookup(vocab, bacov, splitter, -1, current)
        while token != NULL:
            tokens.push_back(<Lexeme_addr>token)
            token = token.tail
    return tokens
cdef Lexeme_addr lookup(Vocab& vocab, dict bacov, Splitter find_split, int start,
                        unicode string) except 0:
    '''Fetch the Lexeme for a word string, creating it if it has not been seen.

    Args:
        vocab: Hash-to-lexeme table that is searched and, on a miss, updated.
        bacov: Reverse index (hash -> string), updated when a lexeme is created.
        find_split: Function that picks the split index for an unseen string.
            It is only called when start == -1.
        start: Split index to use for an unseen string, or -1 to ask
            find_split for it.
        string: The word to look up.

    Returns:
        The address of the Lexeme. A reference to BLANK_WORD is returned for
        the empty string.

    Raises:
        ValueError: If the string is unseen, start == -1 and find_split is NULL.
    '''
    if string == '':
        return <Lexeme_addr>&BLANK_WORD
    cdef size_t length = len(string)
    cdef StringHash hashed = hash_string(string, length)
    cdef Lexeme* word_ptr = <Lexeme*>vocab[hashed]
    if word_ptr == NULL:
        if start == -1:
            # load_tokenization passes a NULL splitter down this call chain;
            # calling through it would crash instead of raising.
            if find_split == NULL:
                raise ValueError(
                    'No splitter available to split unseen string: %r' % string)
            start = find_split(string, length)
        word_ptr = _add(vocab, bacov, find_split, hashed, string, start, length)
    return <Lexeme_addr>word_ptr
cpdef vector[size_t] expand_chunk(size_t addr) except *:
@ -11,3 +84,22 @@ cpdef vector[size_t] expand_chunk(size_t addr) except *:
return tokens
cdef StringHash hash_string(unicode s, size_t length) except 0:
    '''Hash the first `length` Py_UNICODE units of s with MurmurHash64A (seed 0).

    The length must be non-zero; an empty input trips the assertion.
    '''
    assert length
    cdef Py_UNICODE* chars = <Py_UNICODE*>s
    # MurmurHash64A takes a byte count, not a character count.
    cdef size_t n_bytes = length * sizeof(Py_UNICODE)
    return MurmurHash64A(chars, n_bytes, 0)
cdef unicode unhash(dict bacov, StringHash hash_value):
    '''Return the string stored in the reverse index for hash_value.

    A hash that is not in bacov raises KeyError from the dict lookup.
    '''
    cdef unicode original = bacov[hash_value]
    return original
cdef Lexeme* _add(Vocab& vocab, dict bacov, Splitter find_split, StringHash hashed,
                  unicode string, int split, size_t length) except NULL:
    '''Create the Lexeme for string and register it under hashed.

    The lexeme is built by init_lexeme(); afterwards its address is stored in
    vocab and the string itself in bacov, both keyed by hashed.
    '''
    assert string
    assert split <= length
    cdef Lexeme* lexeme = init_lexeme(vocab, bacov, find_split, string, hashed,
                                      split, length)
    # Only register once construction has succeeded.
    vocab[hashed] = <Lexeme_addr>lexeme
    bacov[hashed] = string
    return lexeme

3
spacy/string_tools.pxd Normal file
View File

@ -0,0 +1,3 @@
# Slice string[start:end]; negative or out-of-range bounds mean "no bound".
cpdef unicode substr(unicode string, int start, int end, size_t length)
# True for space, newline and tab only.
cdef bint is_whitespace(Py_UNICODE c)

25
spacy/string_tools.pyx Normal file
View File

@ -0,0 +1,25 @@
cpdef unicode substr(unicode string, int start, int end, size_t length):
    '''Return string[start:end], treating out-of-range bounds as "no bound".

    An end at or past length, or a negative end, means "up to the end of the
    string". A negative start, or a start at or past length, means "from the
    beginning". When neither side needs trimming the string itself is returned.
    '''
    # Fold out-of-range bounds onto the "no bound" markers first.
    if end >= length:
        end = -1
    if start >= length:
        start = 0
    # Nothing to trim on either side.
    if start <= 0 and end < 0:
        return string
    if start < 0:
        start = 0
    elif end < 0:
        end = length
    return string[start:end]
cdef bint is_whitespace(Py_UNICODE c):
    '''Report whether c is a space, a newline or a tab.'''
    # TODO: Support other unicode spaces
    # https://www.cs.tut.fi/~jkorpela/chars/spaces.html
    return c == u' ' or c == u'\n' or c == u'\t'