Refactor for string view features. Working on setting up flags and enums.

Matthew Honnibal 2014-07-07 16:58:48 +02:00
parent 9fd085bf90
commit 057c21969b
11 changed files with 167 additions and 79 deletions

View File

@@ -1,8 +1,8 @@
 from libcpp.vector cimport vector
 from spacy.spacy cimport StringHash
-from spacy.lexeme cimport Lexeme
-from spacy.lexeme cimport Lexeme_addr
+from spacy.spacy cimport Lexeme
+from spacy.spacy cimport Lexeme_addr
 from spacy.spacy cimport Language
 from spacy.tokens cimport Tokens
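Note: the same import move recurs in most files below. Lexeme and Lexeme_addr are now cimported through spacy.spacy, which re-exports the lexeme types alongside Language and the shared ctypedefs, so downstream modules have a single import root for the names involved in the circular-import problem flagged in spacy.pxd. A minimal sketch of the resulting import style (the diffs use one name per line; this combined form is equivalent Cython):

    from spacy.spacy cimport Language
    from spacy.spacy cimport Lexeme, Lexeme_addr, StringHash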

View File

@@ -9,7 +9,6 @@ from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t
 from libcpp.vector cimport vector
-from spacy.lexeme cimport Lexeme
 from spacy.string_tools cimport substr
 from . import util

View File

@@ -2,8 +2,8 @@ from libcpp.vector cimport vector
 from spacy.spacy cimport StringHash
 from spacy.spacy cimport Language
-from spacy.lexeme cimport Lexeme
-from spacy.lexeme cimport Lexeme_addr
+from spacy.spacy cimport Lexeme
+from spacy.spacy cimport Lexeme_addr
 from spacy.tokens cimport Tokens

View File

@@ -4,11 +4,11 @@ boldly assume no collisions.
 '''
 from __future__ import unicode_literals
 from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t
 from libcpp.vector cimport vector
-from spacy.lexeme cimport Lexeme
 from spacy.string_tools cimport substr
 from spacy.spacy cimport Language
 from . import util

View File

@@ -4,29 +4,48 @@ from libc.stdint cimport uint64_t
 ctypedef int ClusterID
 ctypedef uint64_t StringHash
 ctypedef size_t Lexeme_addr
+ctypedef char Bits8
+ctypedef uint64_t Bits64
 
 from spacy.spacy cimport Language
 
+cdef struct Orthography:
+    StringHash last3
+    StringHash shape
+    StringHash norm
+
+    Py_UNICODE first
+    Bits8 flags
+
+
+cdef struct Distribution:
+    double prob
+    ClusterID cluster
+    Bits64 tagdict
+    Bits8 flags
+
+
 cdef struct Lexeme:
     StringHash sic # Hash of the original string
     StringHash lex # Hash of the word, with punctuation and clitics split off
-    StringHash normed # Hash of the normalized version of lex
-    StringHash last3 # Last 3 characters of the token
-    Py_UNICODE first # First character of the token
 
-    double prob # What is the log probability of the lex value?
-    ClusterID cluster # Brown cluster of the token
-    bint oft_upper # Is the lowered version of the lex value often in all caps?
-    bint oft_title # Is the lowered version of the lex value often title-cased?
+    Distribution* dist # Distribution info, lazy loaded
+    Orthography* orth # Extra orthographic views
     Lexeme* tail # Lexemes are linked lists, to deal with sub-tokens
 
-cdef Lexeme BLANK_WORD = Lexeme(0, 0, 0, 0, 0, 0.0, 0, False, False, NULL)
+cdef Lexeme BLANK_WORD = Lexeme(0, 0, NULL, NULL, NULL)
 
-cdef Lexeme* init_lexeme(Language lang, unicode string, StringHash hashed,
-                         int split, size_t length)
+cpdef StringHash lex_of(size_t lex_id) except 0
+cpdef StringHash norm_of(size_t lex_id) except 0
+
+#cdef Lexeme* init_lexeme(Language lang, unicode string, StringHash hashed,
+#                         int split, size_t length)
 
 # Use these to access the Lexeme fields via get_attr(Lexeme*, LexAttr), which
 # has a conditional to pick out the correct item. This allows safe iteration
 # over the Lexeme, via:
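This struct split is the heart of the refactor: the hashes of the raw string stay inline in Lexeme, while the orthographic views and distributional stats move behind pointers that can be populated lazily and shared. Because orth and dist may be NULL until loaded, accessors need a guard; a minimal sketch of the pattern, the same one last3_of adopts in the .pyx diff below:

    cdef StringHash norm_or_zero(Lexeme* w):
        # The Orthography view is lazily loaded, so it may still be NULL.
        return w.orth.norm if w.orth != NULL else 0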

View File

@@ -11,49 +11,7 @@ from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t
 from libcpp.vector cimport vector
+from spacy.spacy cimport StringHash
 
-cdef Lexeme* init_lexeme(Language lang, unicode string, StringHash hashed,
-                         int split, size_t length):
-    assert split <= length
-    cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
-
-    word.first = <Py_UNICODE>(string[0] if string else 0)
-    word.sic = hashed
-
-    cdef unicode tail_string
-    cdef unicode lex
-    if split != 0 and split < length:
-        lex = substr(string, 0, split, length)
-        tail_string = substr(string, split, length, length)
-    else:
-        lex = string
-        tail_string = ''
-    assert lex
-    #cdef unicode normed = normalize_word_string(lex)
-    cdef unicode normed = '?'
-    cdef unicode last3 = substr(string, length - 3, length, length)
-    assert normed
-    assert len(normed)
-
-    word.lex = lang.hash_string(lex, len(lex))
-    word.normed = lang.hash_string(normed, len(normed))
-    word.last3 = lang.hash_string(last3, len(last3))
-
-    lang.bacov[word.lex] = lex
-    lang.bacov[word.normed] = normed
-    lang.bacov[word.last3] = last3
-
-    # These are loaded later
-    word.prob = 0
-    word.cluster = 0
-    word.oft_upper = False
-    word.oft_title = False
-
-    # Now recurse, and deal with the tail
-    if tail_string:
-        word.tail = <Lexeme*>lang.lookup(-1, tail_string, len(tail_string))
-
-    return word
 
 cpdef StringHash sic_of(size_t lex_id) except 0:
@@ -82,6 +40,20 @@ cpdef StringHash lex_of(size_t lex_id) except 0:
     return (<Lexeme*>lex_id).lex
 
+
+cpdef StringHash norm_of(size_t lex_id) except 0:
+    '''Access the `norm' field of the Lexeme pointed to by lex_id.
+
+    The norm field is the hash of a normalized view of the lex string, here
+    its lower-cased form. The other fields refer to properties of the string
+    that the lex field stores a hash of, except sic and tail.
+
+    >>> [unhash(norm_of(lex_id)) for lex_id in from_string(u'Hi! world')]
+    [u'hi', u'!', u'world']
+    '''
+    return (<Lexeme*>lex_id).orth.norm
+
 
 cpdef ClusterID cluster_of(size_t lex_id):
     '''Access the `cluster' field of the Lexeme pointed to by lex_id, which
     gives an integer representation of the cluster ID of the word,
@@ -98,7 +70,7 @@ cpdef ClusterID cluster_of(size_t lex_id):
     while "dapple" is totally different. On the other hand, "scalable" receives
     the same cluster ID as "pineapple", which is not what we'd like.
     '''
-    return (<Lexeme*>lex_id).cluster
+    return (<Lexeme*>lex_id).dist.cluster
 
 
 cpdef Py_UNICODE first_of(size_t lex_id):
@@ -109,7 +81,7 @@ cpdef Py_UNICODE first_of(size_t lex_id):
     >>> unhash(first_of(lex_id))
     u'H'
     '''
-    return (<Lexeme*>lex_id).first
+    return (<Lexeme*>lex_id).orth.first
 
 
 cpdef double prob_of(size_t lex_id):
@@ -133,7 +105,8 @@ cpdef StringHash last3_of(size_t lex_id):
     >>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
     [u'llo', u'!']
     '''
-    return (<Lexeme*>lex_id).last3
+    cdef Lexeme* w = <Lexeme*>lex_id
+    return w.orth.last3 if w.orth != NULL else 0
 
 
 cpdef bint is_oft_upper(size_t lex_id):
@@ -148,7 +121,12 @@ cpdef bint is_oft_upper(size_t lex_id):
     >>> is_oft_upper(lookup(u'aBc')) # This must get the same answer
     True
     '''
-    return (<Lexeme*>lex_id).oft_upper
+    return False
+    #cdef Lexeme* w = <Lexeme*>lex_id
+    #return w.orth.last3 if w.orth != NULL else 0
+    #return (<Lexeme*>lex_id).oft_upper
 
 
 cpdef bint is_oft_title(size_t lex_id):
@@ -163,4 +141,5 @@ cpdef bint is_oft_title(size_t lex_id):
     >>> is_oft_title(lookup(u'MARCUS')) # This must get the same value
     True
     '''
-    return (<Lexeme*>lex_id).oft_title
+    return False
+    #return (<Lexeme*>lex_id).oft_title
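Since a lex_id is just a Lexeme* address carried as a size_t, the accessors above stay free functions rather than methods. A doctest-style sketch of how they compose, assuming the from_string and unhash helpers that this module's docstrings already reference:

    >>> lex_ids = from_string(u'Hello! world')
    >>> [unhash(norm_of(i)) for i in lex_ids]
    [u'hello', u'!', u'world']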

View File

@@ -7,16 +7,27 @@ from ext.sparsehash cimport dense_hash_map
 # Circular import problems here
 ctypedef size_t Lexeme_addr
 ctypedef uint64_t StringHash
-ctypedef dense_hash_map[StringHash, Lexeme_addr] Vocab
-ctypedef int (*Splitter)(unicode word, size_t length)
+ctypedef dense_hash_map[StringHash, size_t] Vocab
+
+from spacy.lexeme cimport Lexeme
+from spacy.tokens cimport Tokens
+
+# Put these above import to avoid circular import problem
+ctypedef char Bits8
+ctypedef uint64_t Bits64
+ctypedef int ClusterID
 
 from spacy.lexeme cimport Lexeme
-from spacy.tokens cimport Tokens
+from spacy.lexeme cimport Distribution
+from spacy.lexeme cimport Orthography
 
 
 cdef class Language:
     cdef object name
     cdef Vocab* vocab
+    cdef Vocab* distri
+    cdef Vocab* ortho
     cdef dict bacov
 
     cdef int find_split(self, unicode word, size_t length)
@@ -26,3 +37,8 @@ cdef class Language:
     cpdef Tokens tokenize(self, unicode text)
 
     cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length)
+
+    cdef Lexeme* init_lexeme(self, unicode string, StringHash hashed,
+                             int split, size_t length)
+    cdef Orthography* init_orth(self, StringHash hashed, unicode lex)
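Language now owns three dense_hash_map tables: vocab maps the hash of the raw string to a Lexeme*, while ortho and distri map the hash of the split-off lex form to Orthography* and Distribution* records, so every token with the same lex form shares one set of string views. A sketch of the lookup-or-init pattern this enables, mirroring init_lexeme in the spacy.pyx diff below:

    cdef Orthography* get_orth(Language lang, StringHash lex_hash, unicode lex):
        # dense_hash_map returns 0 for a missing key, which casts to NULL.
        cdef Orthography* orth = <Orthography*>lang.ortho[0][lex_hash]
        if orth == NULL:
            orth = lang.init_orth(lex_hash, lex)
        return orth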

View File

@@ -6,22 +6,43 @@ from libc.stdlib cimport calloc, free
 from ext.murmurhash cimport MurmurHash64A
 from ext.murmurhash cimport MurmurHash64B
-from spacy.lexeme cimport init_lexeme
+from spacy.lexeme cimport Lexeme
 from spacy.lexeme cimport BLANK_WORD
-from spacy.string_tools cimport is_whitespace
+from spacy.string_tools cimport substr
 
 from . import util
 from os import path
 
 cimport cython
 
+
+def get_normalized(unicode lex, size_t length):
+    return lex.lower()
+    #if lex.isdigit():
+    #    return '!YEAR' if length == 4 else '!DIGIT'
+    #else:
+    #    return lex.lower()
+
+
+def get_word_shape(lex, length):
+    return lex
+
+
+def set_orth_flags(lex, length):
+    return 0
+
+
 cdef class Language:
     def __cinit__(self, name):
         self.name = name
         self.bacov = {}
         self.vocab = new Vocab()
+        self.ortho = new Vocab()
+        self.distri = new Vocab()
         self.vocab[0].set_empty_key(0)
+        self.distri[0].set_empty_key(0)
+        self.ortho[0].set_empty_key(0)
         self.load_tokenization(util.read_tokenization(name))
 
     def load_tokenization(self, token_rules=None):
@@ -80,7 +101,7 @@ cdef class Language:
         return <Lexeme_addr>word_ptr
 
     cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length):
-        word = init_lexeme(self, string, hashed, split, length)
+        word = self.init_lexeme(string, hashed, split, length)
         self.vocab[0][hashed] = <Lexeme_addr>word
         self.bacov[hashed] = string
         return word
@@ -121,6 +142,55 @@ cdef class Language:
     cdef int find_split(self, unicode word, size_t length):
         return -1
 
+    cdef Lexeme* init_lexeme(self, unicode string, StringHash hashed,
+                             int split, size_t length):
+        cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
+
+        word.sic = hashed
+
+        cdef unicode tail_string
+        cdef unicode lex
+        if split != 0 and split < length:
+            lex = substr(string, 0, split, length)
+            tail_string = substr(string, split, length, length)
+        else:
+            lex = string
+            tail_string = ''
+
+        word.lex = self.hash_string(lex, len(lex))
+        self.bacov[word.lex] = lex
+        word.orth = <Orthography*>self.ortho[0][word.lex]
+        if word.orth == NULL:
+            word.orth = self.init_orth(word.lex, lex)
+        word.dist = <Distribution*>self.distri[0][word.lex]
+
+        # Now recurse, and deal with the tail
+        if tail_string:
+            word.tail = <Lexeme*>self.lookup(-1, tail_string, len(tail_string))
+        return word
+
+    cdef Orthography* init_orth(self, StringHash hashed, unicode lex):
+        cdef Orthography* orth = <Orthography*>calloc(1, sizeof(Orthography))
+        orth.first = <Py_UNICODE>lex[0]
+
+        cdef int length = len(lex)
+        orth.flags = set_orth_flags(lex, length)
+
+        cdef unicode last3 = substr(lex, length - 3, length, length)
+        cdef unicode norm = get_normalized(lex, length)
+        cdef unicode shape = get_word_shape(lex, length)
+
+        orth.last3 = self.hash_string(last3, len(last3))
+        orth.shape = self.hash_string(shape, len(shape))
+        orth.norm = self.hash_string(norm, len(norm))
+
+        self.bacov[orth.last3] = last3
+        self.bacov[orth.shape] = shape
+        self.bacov[orth.norm] = norm
+
+        self.ortho[0][hashed] = <size_t>orth
+        return orth
+
 
 cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
@@ -137,7 +207,7 @@ cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
 cpdef vector[size_t] expand_chunk(size_t addr) except *:
     cdef vector[size_t] tokens = vector[size_t]()
     word = <Lexeme*>addr
-    while word is not NULL:
+    while word != NULL:
         tokens.push_back(<size_t>word)
         word = word.tail
     return tokens
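The tail recursion at the end of init_lexeme is what turns a chunk into a chain of sub-tokens: each Lexeme's tail points at the lexeme for the remainder of the string, and expand_chunk flattens that chain. A usage sketch, with the lookup signature inferred from the recursive call above (a split of -1 defers to find_split):

    cdef size_t addr = lang.lookup(-1, u"isn't", 5)
    cdef vector[size_t] toks = expand_chunk(addr)  # one entry per sub-token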

View File

@@ -1,5 +1,6 @@
 # cython: profile=True
 
 cpdef unicode substr(unicode string, int start, int end, size_t length):
     if end >= length:
         end = -1

View File

@@ -1,6 +1,5 @@
 from libcpp.vector cimport vector
-from spacy.lexeme cimport Lexeme
-from spacy.lexeme cimport Lexeme_addr
+from spacy.spacy cimport Lexeme_addr
 from cython.operator cimport dereference as deref
 from spacy.spacy cimport Language

View File

@@ -2,6 +2,11 @@ from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as inc
 
+from spacy.lexeme cimport Lexeme
+from spacy.lexeme cimport norm_of
+from spacy.spacy cimport StringHash
+
 
 cdef class Tokens:
     def __cinit__(self, Language lang):
         self.lang = lang
@@ -38,11 +43,11 @@ cdef class Tokens:
     cpdef dict count_by(self, Field attr):
         counts = {}
         cdef Lexeme_addr t
-        cdef Lexeme* word
+        cdef StringHash key
         for t in self.vctr[0]:
-            word = <Lexeme*>t
-            if word.lex not in counts:
-                counts[word.lex] = 0
-            counts[word.lex] += 1
+            key = norm_of(t)
+            if key not in counts:
+                counts[key] = 0
+            counts[key] += 1
         return counts
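count_by now keys on the norm hash rather than the lex hash, so case variants of a word collapse into one bucket. A hypothetical usage sketch (the Field argument is still accepted, but as of this commit the body always counts norms):

    tokens = lang.tokenize(u'Hello hello HELLO !')
    counts = tokens.count_by(attr)
    # => {<norm hash of u'hello'>: 3, <norm hash of u'!'>: 1}
    # The keys are StringHash values; recover the text via the bacov dict.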