Merge branch 'feature/group_by' into develop

Matthew Honnibal 2014-07-07 12:51:40 +02:00
commit 9fd085bf90
11 changed files with 273 additions and 227 deletions

View File

@@ -54,6 +54,9 @@ exts = [
     Extension("spacy.spacy",
               ["spacy/spacy.pyx", "ext/MurmurHash3.cpp", "ext/MurmurHash2.cpp"],
               language="c++", include_dirs=includes),
+    Extension("spacy.tokens",
+              ["spacy/tokens.pyx", "ext/MurmurHash3.cpp", "ext/MurmurHash2.cpp"],
+              language="c++", include_dirs=includes),
     Extension("spacy.string_tools",
               ["spacy/string_tools.pyx", "ext/MurmurHash3.cpp", "ext/MurmurHash2.cpp"],
               language="c++", include_dirs=includes),

View File

@@ -1,15 +1,18 @@
 from libcpp.vector cimport vector
 from spacy.spacy cimport StringHash
-from spacy.spacy cimport Vocab
 from spacy.lexeme cimport Lexeme
 from spacy.lexeme cimport Lexeme_addr
+from spacy.spacy cimport Language
+from spacy.tokens cimport Tokens


-cdef Vocab* VOCAB
-cdef dict BACOV
+cdef class English(spacy.Language):
+    cdef int find_split(self, unicode word, size_t length)
+
+cdef English EN


 cpdef Lexeme_addr lookup(unicode word) except 0
-cpdef vector[Lexeme_addr] tokenize(unicode string) except *
+cpdef Tokens tokenize(unicode string)
 cpdef unicode unhash(StringHash hash_value)

View File

@@ -11,54 +11,27 @@ from libcpp.vector cimport vector
 from spacy.lexeme cimport Lexeme
 from spacy.string_tools cimport substr

 from . import util

 cimport spacy

-BACOV = {}
-VOCAB = new Vocab(100000)
-VOCAB.set_empty_key(0)
-
-spacy.load_tokenization(VOCAB, BACOV, util.read_tokenization('en'))
-
-
-cpdef vector[Lexeme_addr] tokenize(unicode string) except *:
-    return spacy.tokenize(VOCAB, BACOV, find_split, string)
-
-
-cpdef Lexeme_addr lookup(unicode string) except 0:
-    return spacy.lookup(VOCAB, BACOV, find_split, -1, string, len(string))
-
-
-cpdef unicode unhash(StringHash hash_value):
-    return spacy.unhash(BACOV, hash_value)
-
-
-cdef vector[StringHash] make_string_views(unicode word):
-    cdef unicode s
-    return vector[StringHash]()
-    #if word.isdigit() and len(word) == 4:
-    #    return '!YEAR'
-    #elif word[0].isdigit():
-    #    return '!DIGITS'
-    #else:
-    #    return word.lower()
-
-
-cdef int find_split(unicode word, size_t length):
-    cdef int i = 0
-    # Contractions
-    if word.endswith("'s"):
-        return length - 2
-    # Leading punctuation
-    if is_punct(word, 0, length):
-        return 1
-    elif length >= 1:
-        # Split off all trailing punctuation characters
-        i = 0
-        while i < length and not is_punct(word, i, length):
-            i += 1
-    return i
+
+cdef class English(spacy.Language):
+    cdef int find_split(self, unicode word, size_t length):
+        cdef int i = 0
+        # Contractions
+        if word.endswith("'s"):
+            return length - 2
+        # Leading punctuation
+        if is_punct(word, 0, length):
+            return 1
+        elif length >= 1:
+            # Split off all trailing punctuation characters
+            i = 0
+            while i < length and not is_punct(word, i, length):
+                i += 1
+        return i


 cdef bint is_punct(unicode word, size_t i, size_t length):
@@ -74,4 +47,16 @@ cdef bint is_punct(unicode word, size_t i, size_t length):
     return not word[i].isalnum()

-#spacy.load_browns(VOCAB, BACOV, find_split)
+
+EN = English('en')
+
+
+cpdef Tokens tokenize(unicode string):
+    return EN.tokenize(string)
+
+
+cpdef Lexeme_addr lookup(unicode string) except 0:
+    return EN.lookup(-1, string, len(string))
+
+
+cpdef unicode unhash(StringHash hash_value):
+    return EN.unhash(hash_value)
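For orientation, a rough sketch of how the refactored module-level wrappers are meant to be called; the module path spacy.en is assumed here, since file names are not shown in this view:

    from spacy import en

    tokens = en.tokenize(u"Dogs aren't dangerous.")  # a Tokens instance built via the shared EN object
    print(len(tokens))                               # number of Lexeme addresses appended
    addr = en.lookup(u"dangerous.")                  # Lexeme address; the split point comes from find_split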

View File

@@ -1,15 +1,18 @@
 from libcpp.vector cimport vector
 from spacy.spacy cimport StringHash
-from spacy.spacy cimport Vocab
+from spacy.spacy cimport Language
 from spacy.lexeme cimport Lexeme
 from spacy.lexeme cimport Lexeme_addr
+from spacy.tokens cimport Tokens


-cdef Vocab* VOCAB
-cdef dict BACOV
+cdef class EnglishPTB(Language):
+    cdef int find_split(self, unicode word, size_t length)
+
+cdef EnglishPTB EN_PTB


 cpdef Lexeme_addr lookup(unicode word) except 0
-cpdef vector[Lexeme_addr] tokenize(unicode string) except *
+cpdef Tokens tokenize(unicode string)
 cpdef unicode unhash(StringHash hash_value)

View File

@@ -10,55 +10,27 @@ from libcpp.vector cimport vector
 from spacy.lexeme cimport Lexeme
 from spacy.string_tools cimport substr
+from spacy.spacy cimport Language

 from . import util

 cimport spacy

-BACOV = {}
-VOCAB = new Vocab(100000)
-VOCAB.set_empty_key(0)
-
-spacy.load_tokenization(VOCAB, BACOV, util.read_tokenization('en_ptb'))
-
-
-cpdef vector[Lexeme_addr] tokenize(unicode string) except *:
-    return spacy.tokenize(VOCAB, BACOV, find_split, string)
-
-
-cpdef Lexeme_addr lookup(unicode string) except 0:
-    return spacy.lookup(VOCAB, BACOV, find_split, -1, string, len(string))
-
-
-cpdef unicode unhash(StringHash hash_value):
-    return spacy.unhash(BACOV, hash_value)
-
-
-cdef vector[StringHash] make_string_views(unicode word):
-    cdef unicode s
-    return vector[StringHash]()
-    #if word.isdigit() and len(word) == 4:
-    #    return '!YEAR'
-    #elif word[0].isdigit():
-    #    return '!DIGITS'
-    #else:
-    #    return word.lower()
-
-
-cdef int find_split(unicode word, size_t length):
-    cdef int i = 0
-    # Contractions
-    if word.endswith("'s"):
-        return length - 2
-    # Leading punctuation
-    if is_punct(word, 0, length):
-        return 1
-    elif length >= 1:
-        # Split off all trailing punctuation characters
-        i = 0
-        while i < length and not is_punct(word, i, length):
-            i += 1
-    return i
+
+cdef class EnglishPTB(Language):
+    cdef int find_split(self, unicode word, size_t length):
+        cdef int i = 0
+        # Contractions
+        if word.endswith("'s"):
+            return length - 2
+        # Leading punctuation
+        if is_punct(word, 0, length):
+            return 1
+        elif length >= 1:
+            # Split off all trailing punctuation characters
+            i = 0
+            while i < length and not is_punct(word, i, length):
+                i += 1
+        return i


 cdef bint is_punct(unicode word, size_t i, size_t length):
@@ -72,3 +44,17 @@ cdef bint is_punct(unicode word, size_t i, size_t length):
         return False
     punct_chars = set(',;:' + '@#$%&' + '!?' + '[({' + '})]')
     return word[i] in punct_chars
+
+
+cdef EnglishPTB EN_PTB = EnglishPTB('en_ptb')
+
+
+cpdef Tokens tokenize(unicode string):
+    return EN_PTB.tokenize(string)
+
+
+cpdef Lexeme_addr lookup(unicode string) except 0:
+    return EN_PTB.lookup(-1, string, len(string))
+
+
+cpdef unicode unhash(StringHash hash_value):
+    return EN_PTB.unhash(hash_value)

View File

@@ -5,8 +5,7 @@ ctypedef int ClusterID
 ctypedef uint64_t StringHash
 ctypedef size_t Lexeme_addr

-from spacy.spacy cimport Vocab
-from spacy.spacy cimport Splitter
+from spacy.spacy cimport Language

 cdef struct Lexeme:
     StringHash sic    # Hash of the original string
@@ -25,8 +24,7 @@ cdef struct Lexeme:

 cdef Lexeme BLANK_WORD = Lexeme(0, 0, 0, 0, 0, 0.0, 0, False, False, NULL)

-cdef Lexeme* init_lexeme(Vocab* vocab, dict bacov, Splitter find_split,
-                         unicode string, StringHash hashed,
+cdef Lexeme* init_lexeme(Language lang, unicode string, StringHash hashed,
                          int split, size_t length)

 # Use these to access the Lexeme fields via get_attr(Lexeme*, LexAttr), which

View File

@@ -6,16 +6,13 @@ Lexeme* yourself.
 from __future__ import unicode_literals

 from spacy.string_tools cimport substr
-from spacy.spacy cimport hash_string
-from spacy.spacy cimport lookup

 from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t
 from libcpp.vector cimport vector


-cdef Lexeme* init_lexeme(Vocab* vocab, dict bacov, Splitter find_split,
-                         unicode string, StringHash hashed,
+cdef Lexeme* init_lexeme(Language lang, unicode string, StringHash hashed,
                          int split, size_t length):
     assert split <= length
     cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
@@ -39,13 +36,13 @@ cdef Lexeme* init_lexeme(Vocab* vocab, dict bacov, Splitter find_split,
     assert normed
     assert len(normed)

-    word.lex = hash_string(lex, len(lex))
-    word.normed = hash_string(normed, len(normed))
-    word.last3 = hash_string(last3, len(last3))
+    word.lex = lang.hash_string(lex, len(lex))
+    word.normed = lang.hash_string(normed, len(normed))
+    word.last3 = lang.hash_string(last3, len(last3))

-    bacov[word.lex] = lex
-    bacov[word.normed] = normed
-    bacov[word.last3] = last3
+    lang.bacov[word.lex] = lex
+    lang.bacov[word.normed] = normed
+    lang.bacov[word.last3] = last3

     # These are loaded later
     word.prob = 0
@@ -55,8 +52,7 @@ cdef Lexeme* init_lexeme(Vocab* vocab, dict bacov, Splitter find_split,

     # Now recurse, and deal with the tail
     if tail_string:
-        word.tail = <Lexeme*>lookup(vocab, bacov, find_split, -1, tail_string,
-                                    len(tail_string))
+        word.tail = <Lexeme*>lang.lookup(-1, tail_string, len(tail_string))

     return word

View File

@@ -3,6 +3,7 @@ from libc.stdint cimport uint64_t

 from ext.sparsehash cimport dense_hash_map

 # Circular import problems here
 ctypedef size_t Lexeme_addr
 ctypedef uint64_t StringHash
@@ -11,15 +12,17 @@ ctypedef int (*Splitter)(unicode word, size_t length)

 from spacy.lexeme cimport Lexeme
+from spacy.tokens cimport Tokens

-cdef load_tokenization(Vocab* vocab, dict bacov, token_rules)
-cdef load_browns(Vocab* vocab, dict bacov, Splitter find_split)
-cdef vector[Lexeme_addr] tokenize(Vocab* vocab, dict bacov, Splitter splitter,
-                                  unicode string) except *
-cdef Lexeme_addr lookup(Vocab* vocab, dict bacov, Splitter splitter, int start,
-                        Py_UNICODE* string, size_t length) except 0
-cdef StringHash hash_string(Py_UNICODE* s, size_t length) nogil
-cdef unicode unhash(dict bacov, StringHash hash_value)
-
-cpdef vector[size_t] expand_chunk(size_t addr) except *
+cdef class Language:
+    cdef object name
+    cdef Vocab* vocab
+    cdef dict bacov
+
+    cdef int find_split(self, unicode word, size_t length)
+
+    cdef Lexeme_addr lookup(self, int split, Py_UNICODE* string, size_t length) except 0
+    cdef StringHash hash_string(self, Py_UNICODE* string, size_t length) except 0
+    cdef unicode unhash(self, StringHash hashed)
+
+    cpdef Tokens tokenize(self, unicode text)
+
+    cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length)
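The declarations above replace the old free functions with a Language extension type that owns the vocab, the reverse index (bacov) and the tokenizer loop; a concrete language only overrides find_split. A hypothetical minimal subclass, purely for illustration and not part of this commit:

    from spacy.spacy cimport Language

    cdef class Whitespace(Language):
        # Hypothetical: never split a chunk further, i.e. whitespace-only tokenization.
        cdef int find_split(self, unicode word, size_t length):
            return length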

View File

@@ -16,76 +16,112 @@ from os import path

 cimport cython


-cdef load_tokenization(Vocab* vocab, dict bacov, token_rules):
-    cdef Lexeme* word
-    cdef StringHash hashed
-    for chunk, lex, tokens in token_rules:
-        hashed = hash_string(chunk, len(chunk))
-        assert vocab[0][hashed] == 0, chunk
-        word = _add(vocab, bacov, <Splitter>NULL, hashed, lex, len(lex), len(lex))
-        for i, lex in enumerate(tokens):
-            token_string = '%s:@:%d:@:%s' % (chunk, i, lex)
-            length = len(token_string)
-            hashed = hash_string(token_string, length)
-            word.tail = _add(vocab, bacov, <Splitter>NULL, hashed, lex, 0, len(lex))
-            word = word.tail
-
-
-cdef load_browns(Vocab* vocab, dict bacov, Splitter find_split):
-    cdef Lexeme* w
-    data_dir = path.join(path.dirname(__file__), '..', 'data', 'en')
-    case_stats = util.load_case_stats(data_dir)
-    brown_loc = path.join(data_dir, 'clusters')
-    cdef size_t start
-    cdef int end
-    with util.utf8open(brown_loc) as browns_file:
-        for i, line in enumerate(browns_file):
-            cluster_str, token_string, freq_str = line.split()
-            # Decode as a little-endian string, so that we can do & 15 to get
-            # the first 4 bits. See redshift._parse_features.pyx
-            cluster = int(cluster_str[::-1], 2)
-            upper_pc, title_pc = case_stats.get(token_string.lower(), (0.0, 0.0))
-            start = 0
-            end = -1
-            hashed = hash_string(token_string, len(token_string))
-            word = _add(vocab, bacov, find_split, hashed, token_string,
-                        len(token_string), len(token_string))
-
-
-cdef vector[Lexeme_addr] tokenize(Vocab* vocab, dict bacov, Splitter splitter,
-                                  unicode string) except *:
-    cdef size_t length = len(string)
-    cdef Py_UNICODE* characters = <Py_UNICODE*>string
-
-    cdef size_t i
-    cdef Py_UNICODE c
-
-    cdef vector[Lexeme_addr] tokens = vector[Lexeme_addr]()
-    cdef Py_UNICODE* current = <Py_UNICODE*>calloc(len(string), sizeof(Py_UNICODE))
-    cdef size_t word_len = 0
-
-    cdef Lexeme* token
-    for i in range(length):
-        c = characters[i]
-        if _is_whitespace(c):
-            if word_len != 0:
-                token = <Lexeme*>lookup(vocab, bacov, splitter, -1, current, word_len)
-                while token != NULL:
-                    tokens.push_back(<Lexeme_addr>token)
-                    token = token.tail
-                for j in range(word_len+1):
-                    current[j] = 0
-                word_len = 0
-        else:
-            current[word_len] = c
-            word_len += 1
-
-    if word_len != 0:
-        token = <Lexeme*>lookup(vocab, bacov, splitter, -1, current, word_len)
-        while token != NULL:
-            tokens.push_back(<Lexeme_addr>token)
-            token = token.tail
-
-    free(current)
-    return tokens
+cdef class Language:
+    def __cinit__(self, name):
+        self.name = name
+        self.bacov = {}
+        self.vocab = new Vocab()
+        self.vocab[0].set_empty_key(0)
+        self.load_tokenization(util.read_tokenization(name))
+
+    def load_tokenization(self, token_rules=None):
+        cdef Lexeme* word
+        cdef StringHash hashed
+        for chunk, lex, tokens in token_rules:
+            hashed = self.hash_string(chunk, len(chunk))
+            word = self._add(hashed, lex, len(lex), len(lex))
+            for i, lex in enumerate(tokens):
+                token_string = '%s:@:%d:@:%s' % (chunk, i, lex)
+                length = len(token_string)
+                hashed = self.hash_string(token_string, length)
+                word.tail = self._add(hashed, lex, 0, len(lex))
+                word = word.tail
+
+    def load_clusters(self):
+        cdef Lexeme* w
+        data_dir = path.join(path.dirname(__file__), '..', 'data', 'en')
+        case_stats = util.load_case_stats(data_dir)
+        brown_loc = path.join(data_dir, 'clusters')
+        cdef size_t start
+        cdef int end
+        with util.utf8open(brown_loc) as browns_file:
+            for i, line in enumerate(browns_file):
+                cluster_str, token_string, freq_str = line.split()
+                # Decode as a little-endian string, so that we can do & 15 to get
+                # the first 4 bits. See redshift._parse_features.pyx
+                cluster = int(cluster_str[::-1], 2)
+                upper_pc, title_pc = case_stats.get(token_string.lower(), (0.0, 0.0))
+                hashed = self.hash_string(token_string, len(token_string))
+                word = self._add(hashed, token_string,
+                                 len(token_string), len(token_string))
+
+    cdef StringHash hash_string(self, Py_UNICODE* s, size_t length) except 0:
+        '''Hash unicode with MurmurHash64A'''
+        return MurmurHash64A(<Py_UNICODE*>s, length * sizeof(Py_UNICODE), 0)
+
+    cdef unicode unhash(self, StringHash hash_value):
+        '''Fetch a string from the reverse index, given its hash value.'''
+        return self.bacov[hash_value]
+
+    cdef Lexeme_addr lookup(self, int start, Py_UNICODE* string, size_t length) except 0:
+        '''Fetch a Lexeme representing a word string. If the word has not been seen,
+        construct one, splitting off any attached punctuation or clitics.  A
+        reference to BLANK_WORD is returned for the empty string.
+
+        To specify the boundaries of the word if it has not been seen, use lookup_chunk.
+        '''
+        if length == 0:
+            return <Lexeme_addr>&BLANK_WORD
+        cdef StringHash hashed = self.hash_string(string, length)
+        cdef Lexeme* word_ptr = <Lexeme*>self.vocab[0][hashed]
+        if word_ptr == NULL:
+            start = self.find_split(string, length) if start == -1 else start
+            word_ptr = self._add(hashed, string, start, length)
+        return <Lexeme_addr>word_ptr
+
+    cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length):
+        word = init_lexeme(self, string, hashed, split, length)
+        self.vocab[0][hashed] = <Lexeme_addr>word
+        self.bacov[hashed] = string
+        return word
+
+    cpdef Tokens tokenize(self, unicode string):
+        cdef size_t length = len(string)
+        cdef Py_UNICODE* characters = <Py_UNICODE*>string
+
+        cdef size_t i
+        cdef Py_UNICODE c
+
+        cdef Tokens tokens = Tokens(self)
+        cdef Py_UNICODE* current = <Py_UNICODE*>calloc(len(string), sizeof(Py_UNICODE))
+        cdef size_t word_len = 0
+
+        cdef Lexeme* token
+        for i in range(length):
+            c = characters[i]
+            if _is_whitespace(c):
+                if word_len != 0:
+                    token = <Lexeme*>self.lookup(-1, current, word_len)
+                    while token != NULL:
+                        tokens.append(<Lexeme_addr>token)
+                        token = token.tail
+                    for j in range(word_len+1):
+                        current[j] = 0
+                    word_len = 0
+            else:
+                current[word_len] = c
+                word_len += 1
+
+        if word_len != 0:
+            token = <Lexeme*>self.lookup(-1, current, word_len)
+            while token != NULL:
+                tokens.append(<Lexeme_addr>token)
+                token = token.tail
+
+        free(current)
+        return tokens
+
+    cdef int find_split(self, unicode word, size_t length):
+        return -1


 cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
     if c == ' ':
@@ -97,23 +133,6 @@ cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
     else:
         return False


-cdef Lexeme_addr lookup(Vocab* vocab, dict bacov, Splitter find_split, int start,
-                        Py_UNICODE* string, size_t length) except 0:
-    '''Fetch a Lexeme representing a word string. If the word has not been seen,
-    construct one, splitting off any attached punctuation or clitics.  A
-    reference to BLANK_WORD is returned for the empty string.
-
-    To specify the boundaries of the word if it has not been seen, use lookup_chunk.
-    '''
-    if length == 0:
-        return <Lexeme_addr>&BLANK_WORD
-    cdef StringHash hashed = hash_string(string, length)
-    cdef Lexeme* word_ptr = <Lexeme*>vocab[0][hashed]
-    if word_ptr == NULL:
-        start = find_split(string, length) if start == -1 else start
-        word_ptr = _add(vocab, bacov, find_split, hashed, string, start, length)
-    return <Lexeme_addr>word_ptr
-
-
 cpdef vector[size_t] expand_chunk(size_t addr) except *:
     cdef vector[size_t] tokens = vector[size_t]()
@@ -122,22 +141,3 @@ cpdef vector[size_t] expand_chunk(size_t addr) except *:
         tokens.push_back(<size_t>word)
         word = word.tail
     return tokens
-
-
-cdef StringHash hash_string(Py_UNICODE* s, size_t length) nogil:
-    '''Hash unicode with MurmurHash64A'''
-    return MurmurHash64A(<Py_UNICODE*>s, length * sizeof(Py_UNICODE), 0)
-
-
-cdef unicode unhash(dict bacov, StringHash hash_value):
-    '''Fetch a string from the reverse index, given its hash value.'''
-    return bacov[hash_value]
-
-
-@cython.nonecheck(False)
-cdef Lexeme* _add(Vocab* vocab, dict bacov, Splitter find_split, StringHash hashed,
-                  unicode string, int split, size_t length):
-    word = init_lexeme(vocab, bacov, find_split, string, hashed, split, length)
-    vocab[0][hashed] = <Lexeme_addr>word
-    bacov[hashed] = string
-    return word
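Since the whitespace scan plus the Lexeme tail chain is the core of the new Tokens-returning tokenize() method above, here is a rough plain-Python sketch of the same control flow (illustrative only; find_split stands in for the C-level call, and the guard against a zero split is a simplification not present in the Cython code):

    def tokenize_sketch(find_split, string):
        # Whitespace ends a chunk; each chunk is then split repeatedly,
        # mirroring how the Lexeme tail chain is walked and appended to Tokens.
        tokens = []
        for chunk in string.split():
            while chunk:
                split = find_split(chunk, len(chunk))
                if split <= 0 or split >= len(chunk):
                    tokens.append(chunk)
                    break
                tokens.append(chunk[:split])
                chunk = chunk[split:]
        return tokens

With the English find_split rules, "dangerous." would come out as ["dangerous", "."], while a find_split that always returns the full length reduces this to plain whitespace tokenization.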

spacy/tokens.pxd (new file, 21 lines)
View File

@@ -0,0 +1,21 @@
+from libcpp.vector cimport vector
+from spacy.lexeme cimport Lexeme
+from spacy.lexeme cimport Lexeme_addr
+from cython.operator cimport dereference as deref
+from spacy.spacy cimport Language
+
+
+cdef enum Field:
+    lex
+
+
+cdef class Tokens:
+    cdef Language lang
+    cdef vector[Lexeme_addr]* vctr
+    cdef size_t length
+
+    cpdef int append(self, Lexeme_addr token)
+    cpdef int extend(self, Tokens other) except -1
+
+    cpdef list group_by(self, Field attr)
+    cpdef dict count_by(self, Field attr)

spacy/tokens.pyx (new file, 48 lines)
View File

@@ -0,0 +1,48 @@
+from cython.operator cimport dereference as deref
+from cython.operator cimport preincrement as inc
+
+
+cdef class Tokens:
+    def __cinit__(self, Language lang):
+        self.lang = lang
+        self.vctr = new vector[Lexeme_addr]()
+        self.length = 0
+
+    def __dealloc__(self):
+        del self.vctr
+
+    def __iter__(self):
+        cdef vector[Lexeme_addr].iterator it = self.vctr[0].begin()
+        while it != self.vctr[0].end():
+            yield deref(it)
+            inc(it)
+
+    def __getitem__(self, size_t idx):
+        return self.vctr[0].at(idx)
+
+    def __len__(self):
+        return self.length
+
+    cpdef int append(self, Lexeme_addr token):
+        self.vctr[0].push_back(token)
+        self.length += 1
+
+    cpdef int extend(self, Tokens other) except -1:
+        cdef Lexeme_addr el
+        for el in other:
+            self.append(el)
+
+    cpdef list group_by(self, Field attr):
+        pass
+
+    cpdef dict count_by(self, Field attr):
+        counts = {}
+        cdef Lexeme_addr t
+        cdef Lexeme* word
+        for t in self.vctr[0]:
+            word = <Lexeme*>t
+            if word.lex not in counts:
+                counts[word.lex] = 0
+            counts[word.lex] += 1
+        return counts
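A rough sketch of how the new Tokens container might be exercised from Python (the module path spacy.en is assumed; Field values are passed as plain ints since the cdef enum is not exposed):

    from spacy import en

    tokens = en.tokenize(u"the cat sat on the mat")
    addrs = list(tokens)           # __iter__ yields Lexeme addresses (size_t ints)
    first = tokens[0]              # __getitem__ goes through vector.at()
    counts = tokens.count_by(0)    # 0 == Field.lex; the attr argument is not consulted yet
    print(len(counts))             # number of distinct lex hashes seen
    # group_by() is still a stub (pass) in this commit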