* Reorganized code to accommodate Tokens class. Need string views before group_by and count_by can be done well.

This commit is contained in:
Matthew Honnibal 2014-07-07 12:47:21 +02:00
parent 6da8010577
commit f1bcbd4c4e
10 changed files with 270 additions and 227 deletions

View File

@ -1,15 +1,18 @@
from libcpp.vector cimport vector
from spacy.spacy cimport StringHash
from spacy.spacy cimport Vocab
from spacy.lexeme cimport Lexeme
from spacy.lexeme cimport Lexeme_addr
from spacy.spacy cimport Language
from spacy.tokens cimport Tokens
cdef Vocab* VOCAB
cdef dict BACOV
# Declaration of the English language class, which extends spacy.Language.
cdef class English(spacy.Language):
# Takes a word and its length in characters and returns an int
# (presumably the split index -- see the .pyx implementation).
cdef int find_split(self, unicode word, size_t length)
cdef English EN
cpdef Lexeme_addr lookup(unicode word) except 0
cpdef vector[Lexeme_addr] tokenize(unicode string) except *
cpdef Tokens tokenize(unicode string)
cpdef unicode unhash(StringHash hash_value)

View File

@ -11,54 +11,27 @@ from libcpp.vector cimport vector
from spacy.lexeme cimport Lexeme
from spacy.string_tools cimport substr
from . import util
cimport spacy
BACOV = {}
VOCAB = new Vocab(100000)
VOCAB.set_empty_key(0)
spacy.load_tokenization(VOCAB, BACOV, util.read_tokenization('en'))
cpdef vector[Lexeme_addr] tokenize(unicode string) except *:
    """Tokenize ``string`` using this module's vocabulary.

    Delegates to ``spacy.tokenize`` with the module-level ``VOCAB``,
    ``BACOV`` and ``find_split`` and returns the vector it produces.
    """
    cdef vector[Lexeme_addr] lexeme_addrs = spacy.tokenize(VOCAB, BACOV, find_split, string)
    return lexeme_addrs
cpdef Lexeme_addr lookup(unicode string) except 0:
    """Return the address of the Lexeme for ``string``.

    Delegates to ``spacy.lookup`` with a start of -1 and the full length of
    ``string``, using the module-level ``VOCAB``, ``BACOV`` and ``find_split``.
    """
    cdef Lexeme_addr addr = spacy.lookup(VOCAB, BACOV, find_split, -1, string, len(string))
    return addr
cpdef unicode unhash(StringHash hash_value):
    """Return the string stored for ``hash_value`` in this module's ``BACOV``.

    Delegates to ``spacy.unhash``.
    """
    cdef unicode original = spacy.unhash(BACOV, hash_value)
    return original
cdef vector[StringHash] make_string_views(unicode word):
    """Return the string views for ``word``; currently always an empty vector."""
    cdef unicode s
    cdef vector[StringHash] views = vector[StringHash]()
    return views
    # Unreachable sketch of the intended views, kept for reference:
    #if word.isdigit() and len(word) == 4:
    #    return '!YEAR'
    #elif word[0].isdigit():
    #    return '!DIGITS'
    #else:
    #    return word.lower()
cdef int find_split(unicode word, size_t length):
    """Return the index at which ``word`` should be split.

    * A word ending in "'s" splits just before the clitic (``length - 2``).
    * A word whose first character is punctuation (per ``is_punct``) splits
      after that character (``1``).
    * Otherwise the result is the index of the first character for which
      ``is_punct`` is true, or ``length`` when there is none.
    """
    cdef int split_at = 0
    # Contractions
    if word.endswith("'s"):
        return length - 2
    # Leading punctuation
    if is_punct(word, 0, length):
        return 1
    if length >= 1:
        # Advance to the first punctuation character, if any
        while True:
            if not (split_at < length):
                break
            if is_punct(word, split_at, length):
                break
            split_at += 1
    return split_at
cdef class English(spacy.Language):
    """English tokenizer: a Language with English-specific split rules."""

    cdef int find_split(self, unicode word, size_t length):
        """Return the index at which ``word`` should be split.

        Returns ``length - 2`` for words ending in "'s", ``1`` when the first
        character is punctuation according to ``is_punct``, and otherwise the
        index of the first punctuation character (``length`` if none).
        """
        cdef int pos = 0
        # Contractions
        if word.endswith("'s"):
            return length - 2
        # Leading punctuation
        if is_punct(word, 0, length):
            return 1
        if length < 1:
            return pos
        # Scan forward until a punctuation character or the end of the word
        while pos < length:
            if is_punct(word, pos, length):
                break
            pos += 1
        return pos
cdef bint is_punct(unicode word, size_t i, size_t length):
@ -74,4 +47,16 @@ cdef bint is_punct(unicode word, size_t i, size_t length):
return not word[i].isalnum()
#spacy.load_browns(VOCAB, BACOV, find_split)
EN = English('en')
cpdef Tokens tokenize(unicode string):
    """Tokenize ``string`` with the module-level ``EN`` instance."""
    cdef Tokens tokens = EN.tokenize(string)
    return tokens
cpdef Lexeme_addr lookup(unicode string) except 0:
    """Return the Lexeme address for ``string`` from the module-level ``EN``.

    Passes -1 as the first argument and the full length of ``string``.
    """
    cdef Lexeme_addr addr = EN.lookup(-1, string, len(string))
    return addr
cpdef unicode unhash(StringHash hash_value):
    """Return the string that ``EN`` has recorded for ``hash_value``."""
    cdef unicode original = EN.unhash(hash_value)
    return original

View File

@ -1,15 +1,18 @@
from libcpp.vector cimport vector
from spacy.spacy cimport StringHash
from spacy.spacy cimport Vocab
from spacy.spacy cimport Language
from spacy.lexeme cimport Lexeme
from spacy.lexeme cimport Lexeme_addr
from spacy.tokens cimport Tokens
cdef Vocab* VOCAB
cdef dict BACOV
# Declaration of the EnglishPTB language class, which extends Language.
cdef class EnglishPTB(Language):
# Takes a word and its length in characters and returns an int
# (presumably the split index -- see the .pyx implementation).
cdef int find_split(self, unicode word, size_t length)
cdef EnglishPTB EN_PTB
cpdef Lexeme_addr lookup(unicode word) except 0
cpdef vector[Lexeme_addr] tokenize(unicode string) except *
cpdef Tokens tokenize(unicode string)
cpdef unicode unhash(StringHash hash_value)

View File

@ -10,55 +10,27 @@ from libcpp.vector cimport vector
from spacy.lexeme cimport Lexeme
from spacy.string_tools cimport substr
from spacy.spacy cimport Language
from . import util
cimport spacy
BACOV = {}
VOCAB = new Vocab(100000)
VOCAB.set_empty_key(0)
spacy.load_tokenization(VOCAB, BACOV, util.read_tokenization('en_ptb'))
cpdef vector[Lexeme_addr] tokenize(unicode string) except *:
    """Tokenize ``string`` using this module's ``VOCAB`` and ``BACOV``.

    Thin wrapper around ``spacy.tokenize`` that supplies this module's
    ``find_split`` as the splitter.
    """
    cdef vector[Lexeme_addr] result = spacy.tokenize(VOCAB, BACOV, find_split, string)
    return result
cpdef Lexeme_addr lookup(unicode string) except 0:
    """Return the address of the Lexeme for ``string``.

    Thin wrapper around ``spacy.lookup`` with a start of -1 and the full
    length of ``string``.
    """
    cdef Lexeme_addr lexeme_addr = spacy.lookup(VOCAB, BACOV, find_split, -1, string, len(string))
    return lexeme_addr
cpdef unicode unhash(StringHash hash_value):
    """Return the string stored under ``hash_value`` in this module's ``BACOV``."""
    cdef unicode text = spacy.unhash(BACOV, hash_value)
    return text
cdef vector[StringHash] make_string_views(unicode word):
    """Return the string views for ``word``; at present an empty vector."""
    cdef unicode s
    cdef vector[StringHash] no_views = vector[StringHash]()
    return no_views
    # Unreachable sketch of the intended views, kept for reference:
    #if word.isdigit() and len(word) == 4:
    #    return '!YEAR'
    #elif word[0].isdigit():
    #    return '!DIGITS'
    #else:
    #    return word.lower()
cdef int find_split(unicode word, size_t length):
    """Return the index at which ``word`` should be split.

    ``length - 2`` for words ending in "'s"; ``1`` when the first character
    is punctuation according to ``is_punct``; otherwise the index of the
    first punctuation character, or ``length`` if there is none.
    """
    cdef int idx = 0
    # Contractions
    if word.endswith("'s"):
        return length - 2
    # Leading punctuation
    if is_punct(word, 0, length):
        return 1
    if length >= 1:
        # Walk forward to the first punctuation character, if any
        while idx < length:
            if is_punct(word, idx, length):
                break
            idx += 1
    return idx
cdef class EnglishPTB(Language):
    """Language subclass carrying this module's split rules."""

    cdef int find_split(self, unicode word, size_t length):
        """Return the index at which ``word`` should be split.

        * ``length - 2`` when ``word`` ends in "'s".
        * ``1`` when the first character is punctuation per ``is_punct``.
        * Otherwise the index of the first punctuation character, or
          ``length`` when ``is_punct`` is never true.
        """
        cdef int cursor = 0
        # Contractions
        if word.endswith("'s"):
            return length - 2
        # Leading punctuation
        if is_punct(word, 0, length):
            return 1
        if length < 1:
            return cursor
        # Stop at the first punctuation character or at the end of the word
        while True:
            if not (cursor < length) or is_punct(word, cursor, length):
                break
            cursor += 1
        return cursor
cdef bint is_punct(unicode word, size_t i, size_t length):
@ -72,3 +44,17 @@ cdef bint is_punct(unicode word, size_t i, size_t length):
return False
punct_chars = set(',;:' + '@#$%&' + '!?' + '[({' + '})]')
return word[i] in punct_chars
cdef EnglishPTB EN_PTB = EnglishPTB('en_ptb')
cpdef Tokens tokenize(unicode string):
    """Tokenize ``string`` with the module-level ``EN_PTB`` instance."""
    cdef Tokens result = EN_PTB.tokenize(string)
    return result
cpdef Lexeme_addr lookup(unicode string) except 0:
    """Return the Lexeme address for ``string`` from ``EN_PTB``.

    Passes -1 as the first argument and the full length of ``string``.
    """
    cdef Lexeme_addr lexeme_addr = EN_PTB.lookup(-1, string, len(string))
    return lexeme_addr
cpdef unicode unhash(StringHash hash_value):
    """Return the string that ``EN_PTB`` has recorded for ``hash_value``."""
    cdef unicode text = EN_PTB.unhash(hash_value)
    return text

View File

@ -5,8 +5,7 @@ ctypedef int ClusterID
ctypedef uint64_t StringHash
ctypedef size_t Lexeme_addr
from spacy.spacy cimport Vocab
from spacy.spacy cimport Splitter
from spacy.spacy cimport Language
cdef struct Lexeme:
StringHash sic # Hash of the original string
@ -25,8 +24,7 @@ cdef struct Lexeme:
cdef Lexeme BLANK_WORD = Lexeme(0, 0, 0, 0, 0, 0.0, 0, False, False, NULL)
cdef Lexeme* init_lexeme(Vocab* vocab, dict bacov, Splitter find_split,
unicode string, StringHash hashed,
cdef Lexeme* init_lexeme(Language lang, unicode string, StringHash hashed,
int split, size_t length)
# Use these to access the Lexeme fields via get_attr(Lexeme*, LexAttr), which

View File

@ -6,16 +6,13 @@ Lexeme* yourself.
from __future__ import unicode_literals
from spacy.string_tools cimport substr
from spacy.spacy cimport hash_string
from spacy.spacy cimport lookup
from libc.stdlib cimport malloc, calloc, free
from libc.stdint cimport uint64_t
from libcpp.vector cimport vector
cdef Lexeme* init_lexeme(Vocab* vocab, dict bacov, Splitter find_split,
unicode string, StringHash hashed,
cdef Lexeme* init_lexeme(Language lang, unicode string, StringHash hashed,
int split, size_t length):
assert split <= length
cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
@ -39,13 +36,13 @@ cdef Lexeme* init_lexeme(Vocab* vocab, dict bacov, Splitter find_split,
assert normed
assert len(normed)
word.lex = hash_string(lex, len(lex))
word.normed = hash_string(normed, len(normed))
word.last3 = hash_string(last3, len(last3))
word.lex = lang.hash_string(lex, len(lex))
word.normed = lang.hash_string(normed, len(normed))
word.last3 = lang.hash_string(last3, len(last3))
bacov[word.lex] = lex
bacov[word.normed] = normed
bacov[word.last3] = last3
lang.bacov[word.lex] = lex
lang.bacov[word.normed] = normed
lang.bacov[word.last3] = last3
# These are loaded later
word.prob = 0
@ -55,8 +52,7 @@ cdef Lexeme* init_lexeme(Vocab* vocab, dict bacov, Splitter find_split,
# Now recurse, and deal with the tail
if tail_string:
word.tail = <Lexeme*>lookup(vocab, bacov, find_split, -1, tail_string,
len(tail_string))
word.tail = <Lexeme*>lang.lookup(-1, tail_string, len(tail_string))
return word

View File

@ -3,6 +3,7 @@ from libc.stdint cimport uint64_t
from ext.sparsehash cimport dense_hash_map
# Circular import problems here
ctypedef size_t Lexeme_addr
ctypedef uint64_t StringHash
@ -11,15 +12,17 @@ ctypedef int (*Splitter)(unicode word, size_t length)
from spacy.lexeme cimport Lexeme
from spacy.tokens cimport Tokens
# Module-level function declarations. Each takes the vocabulary table and/or
# the reverse (hash -> string) index explicitly as arguments.
cdef load_tokenization(Vocab* vocab, dict bacov, token_rules)
cdef load_browns(Vocab* vocab, dict bacov, Splitter find_split)
# Returns a vector of Lexeme addresses; `except *` means the caller checks
# for a Python exception after every call.
cdef vector[Lexeme_addr] tokenize(Vocab* vocab, dict bacov, Splitter splitter,
unicode string) except *
# `except 0`: a return value of 0 signals that an exception was raised.
cdef Lexeme_addr lookup(Vocab* vocab, dict bacov, Splitter splitter, int start,
Py_UNICODE* string, size_t length) except 0
# Declared nogil, so it can be called without holding the GIL.
cdef StringHash hash_string(Py_UNICODE* s, size_t length) nogil
cdef unicode unhash(dict bacov, StringHash hash_value)
cpdef vector[size_t] expand_chunk(size_t addr) except *
# Declaration of the Language extension type: per-language state plus the
# lookup / hashing / tokenization methods that operate on it.
cdef class Language:
# Name passed to the constructor (e.g. 'en', 'en_ptb').
cdef object name
# Table mapping a StringHash to a Lexeme address.
cdef Vocab* vocab
# Reverse index: StringHash -> original string.
cdef dict bacov
# Hook overridden by subclasses (English, EnglishPTB).
cdef int find_split(self, unicode word, size_t length)
# `except 0`: a return value of 0 signals that an exception was raised.
cdef Lexeme_addr lookup(self, int split, Py_UNICODE* string, size_t length) except 0
cdef StringHash hash_string(self, Py_UNICODE* string, size_t length) except 0
cdef unicode unhash(self, StringHash hashed)
cpdef Tokens tokenize(self, unicode text)
# Creates a Lexeme for `string` and registers it under `hashed`.
cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length)

View File

@ -16,76 +16,112 @@ from os import path
cimport cython
cdef load_tokenization(Vocab* vocab, dict bacov, token_rules):
    """Pre-populate ``vocab`` and ``bacov`` from special-case tokenization rules.

    ``token_rules`` yields ``(chunk, lex, tokens)`` triples. The head Lexeme
    is stored under the hash of ``chunk``; each string in ``tokens`` becomes
    a further Lexeme linked through ``tail``. Asserts that ``chunk`` has not
    already been added.
    """
    cdef Lexeme* current
    cdef StringHash key
    for chunk, lex, tokens in token_rules:
        key = hash_string(chunk, len(chunk))
        assert vocab[0][key] == 0, chunk
        current = _add(vocab, bacov, <Splitter>NULL, key, lex, len(lex), len(lex))
        for i, lex in enumerate(tokens):
            # Synthetic key built from the chunk, the position and the token string
            token_string = '%s:@:%d:@:%s' % (chunk, i, lex)
            key = hash_string(token_string, len(token_string))
            current.tail = _add(vocab, bacov, <Splitter>NULL, key, lex, 0, len(lex))
            current = current.tail
cdef class Language:
    """Base class for a language: a vocabulary plus tokenization logic.

    Holds the hash -> Lexeme table (``vocab``) and the reverse hash -> string
    index (``bacov``). Subclasses override ``find_split`` to decide where a
    whitespace-delimited chunk is split into tokens.
    """

    def __cinit__(self, name):
        self.name = name
        self.bacov = {}
        self.vocab = new Vocab()
        # Register 0 as the table's empty key before any insertion.
        self.vocab[0].set_empty_key(0)
        self.load_tokenization(util.read_tokenization(name))

    def load_tokenization(self, token_rules=None):
        """Pre-populate the vocabulary with special-case tokenization rules.

        token_rules: iterable of ``(chunk, lex, tokens)`` triples. The head
            Lexeme is stored under the hash of ``chunk`` with string ``lex``;
            each string in ``tokens`` becomes a further Lexeme chained
            through ``tail``. ``None`` (the default) loads nothing.
        """
        cdef Lexeme* word
        cdef StringHash hashed
        if token_rules is None:
            # The default used to raise TypeError when iterated.
            return
        for chunk, lex, tokens in token_rules:
            hashed = self.hash_string(chunk, len(chunk))
            word = self._add(hashed, lex, len(lex), len(lex))
            for i, tail_lex in enumerate(tokens):
                # Synthetic key built from the chunk, the position and the token string
                token_string = '%s:@:%d:@:%s' % (chunk, i, tail_lex)
                hashed = self.hash_string(token_string, len(token_string))
                word.tail = self._add(hashed, tail_lex, 0, len(tail_lex))
                word = word.tail

    def load_clusters(self):
        """Add every token listed in the Brown clusters file to the vocabulary.

        Reads ``../data/en/clusters`` relative to this module; each line is
        split into a cluster bit-string, a token and a frequency.
        """
        cdef Lexeme* word
        cdef StringHash hashed
        # NOTE(review): the directory is hard-coded to 'en' rather than
        # derived from self.name -- confirm this is intended for subclasses.
        data_dir = path.join(path.dirname(__file__), '..', 'data', 'en')
        case_stats = util.load_case_stats(data_dir)
        brown_loc = path.join(data_dir, 'clusters')
        with util.utf8open(brown_loc) as browns_file:
            for line in browns_file:
                cluster_str, token_string, freq_str = line.split()
                # Decode as a little-endian string, so that we can do & 15 to get
                # the first 4 bits. See redshift._parse_features.pyx
                cluster = int(cluster_str[::-1], 2)
                upper_pc, title_pc = case_stats.get(token_string.lower(), (0.0, 0.0))
                # The cluster and case statistics are parsed but not yet
                # stored on the Lexeme.
                hashed = self.hash_string(token_string, len(token_string))
                word = self._add(hashed, token_string,
                                 len(token_string), len(token_string))

    cdef StringHash hash_string(self, Py_UNICODE* s, size_t length) except 0:
        '''Hash unicode with MurmurHash64A'''
        return MurmurHash64A(<Py_UNICODE*>s, length * sizeof(Py_UNICODE), 0)

    cdef unicode unhash(self, StringHash hash_value):
        '''Fetch a string from the reverse index, given its hash value.'''
        return self.bacov[hash_value]

    cdef Lexeme_addr lookup(self, int start, Py_UNICODE* string, size_t length) except 0:
        '''Fetch a Lexeme representing a word string. If the word has not been seen,
        construct one, splitting off any attached punctuation or clitics. A
        reference to BLANK_WORD is returned for the empty string.

        To specify the boundaries of the word if it has not been seen, use lookup_chunk.

        ``string`` must be zero-terminated: it is coerced to a Python unicode
        object when handed to ``find_split`` and ``_add``.
        '''
        cdef StringHash hashed
        cdef Lexeme* word_ptr
        if length == 0:
            return <Lexeme_addr>&BLANK_WORD
        hashed = self.hash_string(string, length)
        word_ptr = <Lexeme*>self.vocab[0][hashed]
        if word_ptr == NULL:
            # -1 means "let the language decide where to split"
            if start == -1:
                start = self.find_split(string, length)
            word_ptr = self._add(hashed, string, start, length)
        return <Lexeme_addr>word_ptr

    cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length):
        '''Create a Lexeme for ``string`` and register it, and the string, under ``hashed``.'''
        cdef Lexeme* word = init_lexeme(self, string, hashed, split, length)
        self.vocab[0][hashed] = <Lexeme_addr>word
        self.bacov[hashed] = string
        return word

    cpdef Tokens tokenize(self, unicode string):
        '''Split ``string`` on whitespace and look up each chunk.

        Every chunk is passed to ``lookup``; the returned Lexeme and all
        Lexemes reachable through its ``tail`` chain are appended, in order,
        to the returned Tokens.
        '''
        cdef size_t length = len(string)
        cdef Py_UNICODE* characters = <Py_UNICODE*>string
        cdef size_t i
        cdef size_t j
        cdef Py_UNICODE c
        cdef Tokens tokens = Tokens(self)
        cdef size_t word_len = 0
        cdef Lexeme* token
        # One extra zeroed slot keeps the scratch buffer zero-terminated even
        # when a single chunk spans the whole input; lookup() coerces the
        # pointer to unicode, which scans for the terminator.
        cdef Py_UNICODE* current = <Py_UNICODE*>calloc(length + 1, sizeof(Py_UNICODE))
        if current == NULL:
            raise MemoryError()
        try:
            for i in range(length):
                c = characters[i]
                if _is_whitespace(c):
                    if word_len != 0:
                        token = <Lexeme*>self.lookup(-1, current, word_len)
                        while token != NULL:
                            tokens.append(<Lexeme_addr>token)
                            token = token.tail
                        # Slots at or past word_len are already zero.
                        for j in range(word_len):
                            current[j] = 0
                        word_len = 0
                else:
                    current[word_len] = c
                    word_len += 1
            # Flush the final chunk when the input does not end in whitespace.
            if word_len != 0:
                token = <Lexeme*>self.lookup(-1, current, word_len)
                while token != NULL:
                    tokens.append(<Lexeme_addr>token)
                    token = token.tail
        finally:
            # Release the scratch buffer even if lookup() raised.
            free(current)
        return tokens

    cdef int find_split(self, unicode word, size_t length):
        '''Default split hook; the base class always returns -1.'''
        return -1
cdef load_browns(Vocab* vocab, dict bacov, Splitter find_split):
    """Add every token listed in the Brown clusters file to ``vocab``.

    Reads ``../data/en/clusters`` relative to this module; each line is split
    into a cluster bit-string, a token and a frequency. The token is hashed
    and registered through ``_add`` with split and length both equal to its
    full length.
    """
    # The unused locals (w, start, end) were dropped, and the Lexeme pointer
    # now has an explicit C type instead of relying on type inference.
    cdef Lexeme* word
    cdef StringHash hashed
    data_dir = path.join(path.dirname(__file__), '..', 'data', 'en')
    case_stats = util.load_case_stats(data_dir)
    brown_loc = path.join(data_dir, 'clusters')
    with util.utf8open(brown_loc) as browns_file:
        for line in browns_file:
            cluster_str, token_string, freq_str = line.split()
            # Decode as a little-endian string, so that we can do & 15 to get
            # the first 4 bits. See redshift._parse_features.pyx
            cluster = int(cluster_str[::-1], 2)
            upper_pc, title_pc = case_stats.get(token_string.lower(), (0.0, 0.0))
            # The cluster and case statistics are parsed but not yet stored
            # on the Lexeme.
            hashed = hash_string(token_string, len(token_string))
            word = _add(vocab, bacov, find_split, hashed, token_string,
                        len(token_string), len(token_string))
cdef vector[Lexeme_addr] tokenize(Vocab* vocab, dict bacov, Splitter splitter,
                                  unicode string) except *:
    """Split ``string`` on whitespace and look up each chunk.

    Every chunk is passed to ``lookup``; the address of the returned Lexeme
    and of each Lexeme reachable through its ``tail`` chain is pushed, in
    order, onto the returned vector.
    """
    cdef size_t length = len(string)
    cdef Py_UNICODE* characters = <Py_UNICODE*>string
    cdef size_t i
    cdef size_t j
    cdef Py_UNICODE c
    cdef vector[Lexeme_addr] tokens = vector[Lexeme_addr]()
    cdef size_t word_len = 0
    cdef Lexeme* token
    # One extra zeroed slot keeps the scratch buffer zero-terminated even when
    # a single chunk spans the whole input; lookup() coerces the pointer to
    # unicode, which scans for the terminator.
    cdef Py_UNICODE* current = <Py_UNICODE*>calloc(length + 1, sizeof(Py_UNICODE))
    if current == NULL:
        raise MemoryError()
    try:
        for i in range(length):
            c = characters[i]
            if _is_whitespace(c):
                if word_len != 0:
                    token = <Lexeme*>lookup(vocab, bacov, splitter, -1, current, word_len)
                    while token != NULL:
                        tokens.push_back(<Lexeme_addr>token)
                        token = token.tail
                    # Slots at or past word_len are already zero.
                    for j in range(word_len):
                        current[j] = 0
                    word_len = 0
            else:
                current[word_len] = c
                word_len += 1
        # Flush the final chunk when the input does not end in whitespace.
        if word_len != 0:
            token = <Lexeme*>lookup(vocab, bacov, splitter, -1, current, word_len)
            while token != NULL:
                tokens.push_back(<Lexeme_addr>token)
                token = token.tail
    finally:
        # Release the scratch buffer even if lookup() raised.
        free(current)
    return tokens
cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
if c == ' ':
@ -97,23 +133,6 @@ cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
else:
return False
cdef Lexeme_addr lookup(Vocab* vocab, dict bacov, Splitter find_split, int start,
                        Py_UNICODE* string, size_t length) except 0:
    '''Return the address of the Lexeme for a word string.

    An unseen word is constructed on the spot, with attached punctuation or
    clitics split off. The empty string yields a reference to BLANK_WORD.

    To specify the boundaries of the word if it has not been seen, use lookup_chunk.
    '''
    cdef StringHash key
    cdef Lexeme* entry
    if length == 0:
        return <Lexeme_addr>&BLANK_WORD
    key = hash_string(string, length)
    entry = <Lexeme*>vocab[0][key]
    if entry != NULL:
        return <Lexeme_addr>entry
    # A start of -1 asks the splitter to choose the split point
    if start == -1:
        start = find_split(string, length)
    entry = _add(vocab, bacov, find_split, key, string, start, length)
    return <Lexeme_addr>entry
cpdef vector[size_t] expand_chunk(size_t addr) except *:
cdef vector[size_t] tokens = vector[size_t]()
@ -122,22 +141,3 @@ cpdef vector[size_t] expand_chunk(size_t addr) except *:
tokens.push_back(<size_t>word)
word = word.tail
return tokens
cdef StringHash hash_string(Py_UNICODE* s, size_t length) nogil:
    '''Return the MurmurHash64A (seed 0) of the first ``length`` code units of ``s``.'''
    cdef size_t n_bytes = length * sizeof(Py_UNICODE)
    return MurmurHash64A(<Py_UNICODE*>s, n_bytes, 0)
cdef unicode unhash(dict bacov, StringHash hash_value):
    '''Look up ``hash_value`` in the reverse index ``bacov`` and return its string.'''
    cdef unicode original = bacov[hash_value]
    return original
@cython.nonecheck(False)
cdef Lexeme* _add(Vocab* vocab, dict bacov, Splitter find_split, StringHash hashed,
                  unicode string, int split, size_t length):
    '''Create a Lexeme for ``string`` and register it under ``hashed``.

    The Lexeme address goes into ``vocab`` and the string into ``bacov``,
    both keyed by ``hashed``. Returns the new Lexeme.
    '''
    cdef Lexeme* lexeme = init_lexeme(vocab, bacov, find_split, string, hashed, split, length)
    vocab[0][hashed] = <Lexeme_addr>lexeme
    bacov[hashed] = string
    return lexeme

21
spacy/tokens.pxd Normal file
View File

@ -0,0 +1,21 @@
from libcpp.vector cimport vector
from spacy.lexeme cimport Lexeme
from spacy.lexeme cimport Lexeme_addr
from cython.operator cimport dereference as deref
from spacy.spacy cimport Language
# Lexeme fields that Tokens.group_by / Tokens.count_by accept as their
# `attr` argument. Only `lex` is defined.
cdef enum Field:
lex
# Declaration of the Tokens extension type: a sequence of Lexeme addresses
# together with the Language it was created for.
cdef class Tokens:
# Language instance passed to the constructor.
cdef Language lang
# Heap-allocated vector holding the Lexeme addresses.
cdef vector[Lexeme_addr]* vctr
# Element count, incremented by append().
cdef size_t length
cpdef int append(self, Lexeme_addr token)
# `except -1`: a return value of -1 signals that an exception was raised.
cpdef int extend(self, Tokens other) except -1
cpdef list group_by(self, Field attr)
cpdef dict count_by(self, Field attr)

48
spacy/tokens.pyx Normal file
View File

@ -0,0 +1,48 @@
from cython.operator cimport dereference as deref
from cython.operator cimport preincrement as inc
cdef class Tokens:
    """A growable sequence of Lexeme addresses produced by a Language.

    The addresses live in a heap-allocated C++ vector owned by this object;
    ``length`` mirrors the number of stored elements.
    """

    def __cinit__(self, Language lang):
        self.lang = lang
        self.vctr = new vector[Lexeme_addr]()
        self.length = 0

    def __dealloc__(self):
        del self.vctr

    def __iter__(self):
        """Yield each stored Lexeme address in insertion order."""
        # Iterate by index, not with a C++ iterator: an append() between two
        # yields may reallocate the vector and would invalidate an iterator.
        cdef size_t i = 0
        while i < self.vctr[0].size():
            yield self.vctr[0][i]
            i += 1

    def __getitem__(self, size_t idx):
        """Return the address at ``idx``; bounds-checked via ``vector.at``."""
        return self.vctr[0].at(idx)

    def __len__(self):
        return self.length

    cpdef int append(self, Lexeme_addr token):
        """Append one Lexeme address. Returns 0."""
        self.vctr[0].push_back(token)
        self.length += 1
        return 0

    cpdef int extend(self, Tokens other) except -1:
        """Append every address held by ``other``. Returns 0.

        Safe when ``other`` is ``self``: only the elements present at the
        time of the call are copied.
        """
        cdef size_t i
        cdef size_t n_other
        if other is None:
            raise TypeError("extend() requires a Tokens instance, not None")
        # Snapshot the size first so that self.extend(self) terminates.
        n_other = other.vctr[0].size()
        # Reserve up front so the appends below do not reallocate repeatedly.
        self.vctr[0].reserve(self.vctr[0].size() + n_other)
        for i in range(n_other):
            self.append(other.vctr[0][i])
        return 0

    cpdef list group_by(self, Field attr):
        """Not implemented yet; currently returns None."""
        return None

    cpdef dict count_by(self, Field attr):
        """Return a dict mapping each token's ``lex`` hash to its frequency.

        ``attr`` is currently ignored: counts are always keyed by ``lex``,
        the only member of ``Field``.
        """
        cdef dict counts = {}
        cdef Lexeme_addr t
        cdef Lexeme* word
        for t in self.vctr[0]:
            word = <Lexeme*>t
            counts[word.lex] = counts.get(word.lex, 0) + 1
        return counts