* Refactor for string view features. Working on setting up flags and enums.

Matthew Honnibal 2014-07-07 16:58:48 +02:00
parent 9fd085bf90
commit 057c21969b
11 changed files with 167 additions and 79 deletions

View File

@@ -1,8 +1,8 @@
from libcpp.vector cimport vector
from spacy.spacy cimport StringHash
from spacy.lexeme cimport Lexeme
from spacy.lexeme cimport Lexeme_addr
from spacy.spacy cimport Lexeme
from spacy.spacy cimport Lexeme_addr
from spacy.spacy cimport Language
from spacy.tokens cimport Tokens

View File

@@ -9,7 +9,6 @@ from libc.stdlib cimport malloc, calloc, free
from libc.stdint cimport uint64_t
from libcpp.vector cimport vector
from spacy.lexeme cimport Lexeme
from spacy.string_tools cimport substr
from . import util

View File

@@ -2,8 +2,8 @@ from libcpp.vector cimport vector
from spacy.spacy cimport StringHash
from spacy.spacy cimport Language
from spacy.lexeme cimport Lexeme
from spacy.lexeme cimport Lexeme_addr
from spacy.spacy cimport Lexeme
from spacy.spacy cimport Lexeme_addr
from spacy.tokens cimport Tokens

View File

@@ -4,11 +4,11 @@ boldly assume no collisions.
'''
from __future__ import unicode_literals
from libc.stdlib cimport malloc, calloc, free
from libc.stdint cimport uint64_t
from libcpp.vector cimport vector
from spacy.lexeme cimport Lexeme
from spacy.string_tools cimport substr
from spacy.spacy cimport Language
from . import util

View File

@@ -4,29 +4,48 @@ from libc.stdint cimport uint64_t
ctypedef int ClusterID
ctypedef uint64_t StringHash
ctypedef size_t Lexeme_addr
ctypedef char Bits8
ctypedef uint64_t Bits64

from spacy.spacy cimport Language

cdef struct Orthography:
    StringHash last3
    StringHash shape
    StringHash norm
    Py_UNICODE first
    Bits8 flags

cdef struct Distribution:
    double prob
    ClusterID cluster
    Bits64 tagdict
    Bits8 flags

cdef struct Lexeme:
    StringHash sic      # Hash of the original string
    StringHash lex      # Hash of the word, with punctuation and clitics split off
    StringHash normed   # Hash of the normalized version of lex
    StringHash last3    # Last 3 characters of the token
    Py_UNICODE first    # First character of the token
    double prob         # What is the log probability of the lex value?
    ClusterID cluster   # Brown cluster of the token
    bint oft_upper      # Is the lowered version of the lex value often in all caps?
    bint oft_title      # Is the lowered version of the lex value often title-cased?
    Distribution* dist  # Distribution info, lazy loaded
    Orthography* orth   # Extra orthographic views
    Lexeme* tail        # Lexemes are linked lists, to deal with sub-tokens

cdef Lexeme BLANK_WORD = Lexeme(0, 0, 0, 0, 0, 0.0, 0, False, False, NULL)
cdef Lexeme BLANK_WORD = Lexeme(0, 0, NULL, NULL, NULL)

cdef Lexeme* init_lexeme(Language lang, unicode string, StringHash hashed,
                         int split, size_t length)
cpdef StringHash lex_of(size_t lex_id) except 0
cpdef StringHash norm_of(size_t lex_id) except 0
#cdef Lexeme* init_lexeme(Language lang, unicode string, StringHash hashed,
#                         int split, size_t length)

# Use these to access the Lexeme fields via get_attr(Lexeme*, LexAttr), which
# has a conditional to pick out the correct item. This allows safe iteration
# over the Lexeme, via:
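The comment above breaks off after "via:"; the accessor itself is not shown in this hunk. Purely as an editorial sketch (not part of this commit, and with assumed enum and field names), a conditional accessor of the kind the comment describes could look like:

# Editorial sketch only: enum values are illustrative, not taken from the diff.
cdef enum LexAttr:
    SIC
    LEX
    NORM
    LAST3

cdef StringHash get_attr(Lexeme* word, LexAttr attr):
    # One branch per hash-valued view, so callers can loop over the enum
    # instead of touching struct fields directly.
    if attr == SIC:
        return word.sic
    elif attr == LEX:
        return word.lex
    elif attr == NORM:
        return word.orth.norm if word.orth != NULL else 0
    else:
        return word.orth.last3 if word.orth != NULL else 0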

View File

@@ -11,49 +11,7 @@ from libc.stdlib cimport malloc, calloc, free
from libc.stdint cimport uint64_t
from libcpp.vector cimport vector
cdef Lexeme* init_lexeme(Language lang, unicode string, StringHash hashed,
                         int split, size_t length):
    assert split <= length
    cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))

    word.first = <Py_UNICODE>(string[0] if string else 0)
    word.sic = hashed

    cdef unicode tail_string
    cdef unicode lex
    if split != 0 and split < length:
        lex = substr(string, 0, split, length)
        tail_string = substr(string, split, length, length)
    else:
        lex = string
        tail_string = ''
    assert lex

    #cdef unicode normed = normalize_word_string(lex)
    cdef unicode normed = '?'
    cdef unicode last3 = substr(string, length - 3, length, length)
    assert normed
    assert len(normed)

    word.lex = lang.hash_string(lex, len(lex))
    word.normed = lang.hash_string(normed, len(normed))
    word.last3 = lang.hash_string(last3, len(last3))

    lang.bacov[word.lex] = lex
    lang.bacov[word.normed] = normed
    lang.bacov[word.last3] = last3

    # These are loaded later
    word.prob = 0
    word.cluster = 0
    word.oft_upper = False
    word.oft_title = False

    # Now recurse, and deal with the tail
    if tail_string:
        word.tail = <Lexeme*>lang.lookup(-1, tail_string, len(tail_string))
    return word
from spacy.spacy cimport StringHash
cpdef StringHash sic_of(size_t lex_id) except 0:
@@ -82,6 +40,20 @@ cpdef StringHash lex_of(size_t lex_id) except 0:
    return (<Lexeme*>lex_id).lex

cpdef StringHash norm_of(size_t lex_id) except 0:
    '''Access the `lex' field of the Lexeme pointed to by lex_id.

    The lex field is the hash of the string you would expect to get back from
    a standard tokenizer, i.e. the word with punctuation and other non-whitespace
    delimited tokens split off. The other fields refer to properties of the
    string that the lex field stores a hash of, except sic and tail.

    >>> [unhash(lex_of(lex_id)) for lex_id in from_string(u'Hi! world')]
    [u'Hi', u'!', u'world']
    '''
    return (<Lexeme*>lex_id).orth.norm

cpdef ClusterID cluster_of(size_t lex_id):
    '''Access the `cluster' field of the Lexeme pointed to by lex_id, which
    gives an integer representation of the cluster ID of the word,
@@ -98,7 +70,7 @@ cpdef ClusterID cluster_of(size_t lex_id):
    while "dapple" is totally different. On the other hand, "scalable" receives
    the same cluster ID as "pineapple", which is not what we'd like.
    '''
    return (<Lexeme*>lex_id).cluster
    return (<Lexeme*>lex_id).dist.cluster
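The docstring above compares words by reading their cluster IDs as binary prefixes. As an illustration only (the cluster codes below are invented, not taken from spaCy), a prefix comparison could be written like this:

# Editorial sketch: invented 16-bit cluster codes, purely for illustration.
def shared_prefix_len(a, b, width=16):
    a_bits = format(a, '0%db' % width)
    b_bits = format(b, '0%db' % width)
    n = 0
    while n < width and a_bits[n] == b_bits[n]:
        n += 1
    return n

shared_prefix_len(0b1001111101100000, 0b1001111001000000)  # -> 7, long shared prefix: similar words
shared_prefix_len(0b1001111101100000, 0b0101011101100100)  # -> 0, no shared prefix: unrelated words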
cpdef Py_UNICODE first_of(size_t lex_id):
@@ -109,7 +81,7 @@ cpdef Py_UNICODE first_of(size_t lex_id):
    >>> unhash(first_of(lex_id))
    u'H'
    '''
    return (<Lexeme*>lex_id).first
    return (<Lexeme*>lex_id).orth.first
cpdef double prob_of(size_t lex_id):
@@ -133,7 +105,8 @@ cpdef StringHash last3_of(size_t lex_id):
    >>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
    [u'llo', u'!']
    '''
    return (<Lexeme*>lex_id).last3
    cdef Lexeme* w = <Lexeme*>lex_id
    return w.orth.last3 if w.orth != NULL else 0
cpdef bint is_oft_upper(size_t lex_id):
@@ -148,7 +121,12 @@ cpdef bint is_oft_upper(size_t lex_id):
    >>> is_oft_upper(lookup(u'aBc')) # This must get the same answer
    True
    '''
    return (<Lexeme*>lex_id).oft_upper
    return False
    #cdef Lexeme* w = <Lexeme*>lex_id
    #return w.orth.last3 if w.orth != NULL else 0
    #return (<Lexeme*>lex_id).oft_upper
cpdef bint is_oft_title(size_t lex_id):
@@ -163,4 +141,5 @@ cpdef bint is_oft_title(size_t lex_id):
    >>> is_oft_title(lookup(u'MARCUS')) # This must get the same value
    True
    '''
    return (<Lexeme*>lex_id).oft_title
    return False
    #return (<Lexeme*>lex_id).oft_title

View File

@@ -7,16 +7,27 @@ from ext.sparsehash cimport dense_hash_map
# Circular import problems here
ctypedef size_t Lexeme_addr
ctypedef uint64_t StringHash
ctypedef dense_hash_map[StringHash, Lexeme_addr] Vocab
ctypedef int (*Splitter)(unicode word, size_t length)
ctypedef dense_hash_map[StringHash, size_t] Vocab
from spacy.lexeme cimport Lexeme
from spacy.tokens cimport Tokens
# Put these above import to avoid circular import problem
ctypedef char Bits8
ctypedef uint64_t Bits64
ctypedef int ClusterID
from spacy.lexeme cimport Lexeme
from spacy.tokens cimport Tokens
from spacy.lexeme cimport Distribution
from spacy.lexeme cimport Orthography
cdef class Language:
    cdef object name
    cdef Vocab* vocab
    cdef Vocab* distri
    cdef Vocab* ortho
    cdef dict bacov

    cdef int find_split(self, unicode word, size_t length)
@@ -26,3 +37,8 @@ cdef class Language:
    cpdef Tokens tokenize(self, unicode text)

    cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length)
    cdef Lexeme* init_lexeme(self, unicode string, StringHash hashed,
                             int split, size_t length)
    cdef Orthography* init_orth(self, StringHash hashed, unicode lex)

View File

@@ -6,22 +6,43 @@ from libc.stdlib cimport calloc, free
from ext.murmurhash cimport MurmurHash64A
from ext.murmurhash cimport MurmurHash64B
from spacy.lexeme cimport init_lexeme
from spacy.lexeme cimport Lexeme
from spacy.lexeme cimport BLANK_WORD
from spacy.string_tools cimport is_whitespace
from spacy.string_tools cimport substr
from . import util
from os import path
cimport cython
def get_normalized(unicode lex, size_t length):
    return lex.lower()
    #if lex.isdigit():
    #    return '!YEAR' if length == 4 else '!DIGIT'
    #else:
    #    return lex.lower()

def get_word_shape(lex, length):
    return lex

def set_orth_flags(lex, length):
    return 0
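At this point get_word_shape and set_orth_flags are placeholders (the commit message notes the flags and enums are still being set up). Purely as an editorial sketch, not part of this commit, a word-shape transform of the usual kind might look roughly like this:

# Editorial sketch: a typical word-shape mapping, e.g. u'Token' -> u'Xxxxx',
# u'2014' -> u'dddd'. The exact scheme is assumed here, not taken from spaCy.
def _sketch_word_shape(unicode lex):
    shape = []
    for c in lex:
        if c.isdigit():
            shape.append(u'd')
        elif c.isupper():
            shape.append(u'X')
        elif c.isalpha():
            shape.append(u'x')
        else:
            shape.append(c)
    return u''.join(shape)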
cdef class Language:
    def __cinit__(self, name):
        self.name = name
        self.bacov = {}
        self.vocab = new Vocab()
        self.ortho = new Vocab()
        self.distri = new Vocab()
        self.vocab[0].set_empty_key(0)
        self.distri[0].set_empty_key(0)
        self.ortho[0].set_empty_key(0)
        self.load_tokenization(util.read_tokenization(name))

    def load_tokenization(self, token_rules=None):
@@ -80,7 +101,7 @@ cdef class Language:
            return <Lexeme_addr>word_ptr

    cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length):
        word = init_lexeme(self, string, hashed, split, length)
        word = self.init_lexeme(string, hashed, split, length)
        self.vocab[0][hashed] = <Lexeme_addr>word
        self.bacov[hashed] = string
        return word
@@ -121,6 +142,55 @@ cdef class Language:
    cdef int find_split(self, unicode word, size_t length):
        return -1

    cdef Lexeme* init_lexeme(self, unicode string, StringHash hashed,
                             int split, size_t length):
        cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))

        word.sic = hashed

        cdef unicode tail_string
        cdef unicode lex
        if split != 0 and split < length:
            lex = substr(string, 0, split, length)
            tail_string = substr(string, split, length, length)
        else:
            lex = string
            tail_string = ''

        word.lex = self.hash_string(lex, len(lex))
        self.bacov[word.lex] = lex
        word.orth = <Orthography*>self.ortho[0][word.lex]
        if word.orth == NULL:
            word.orth = self.init_orth(word.lex, lex)
        word.dist = <Distribution*>self.distri[0][word.lex]

        # Now recurse, and deal with the tail
        if tail_string:
            word.tail = <Lexeme*>self.lookup(-1, tail_string, len(tail_string))
        return word

    cdef Orthography* init_orth(self, StringHash hashed, unicode lex):
        cdef Orthography* orth = <Orthography*>calloc(1, sizeof(Orthography))
        orth.first = <Py_UNICODE>lex[0]

        cdef int length = len(lex)
        orth.flags = set_orth_flags(lex, length)

        cdef unicode last3 = substr(lex, length - 3, length, length)
        cdef unicode norm = get_normalized(lex, length)
        cdef unicode shape = get_word_shape(lex, length)

        orth.last3 = self.hash_string(last3, len(last3))
        orth.shape = self.hash_string(shape, len(shape))
        orth.norm = self.hash_string(norm, len(norm))

        self.bacov[orth.last3] = last3
        self.bacov[orth.shape] = shape
        self.bacov[orth.norm] = norm

        self.ortho[0][hashed] = <size_t>orth
        return orth
cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
@@ -137,7 +207,7 @@ cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
cpdef vector[size_t] expand_chunk(size_t addr) except *:
    cdef vector[size_t] tokens = vector[size_t]()
    word = <Lexeme*>addr
    while word is not NULL:
    while word != NULL:
        tokens.push_back(<size_t>word)
        word = word.tail
    return tokens

View File

@@ -1,5 +1,6 @@
# cython: profile=True

cpdef unicode substr(unicode string, int start, int end, size_t length):
    if end >= length:
        end = -1

View File

@@ -1,6 +1,5 @@
from libcpp.vector cimport vector
from spacy.lexeme cimport Lexeme
from spacy.lexeme cimport Lexeme_addr
from spacy.spacy cimport Lexeme_addr
from cython.operator cimport dereference as deref
from spacy.spacy cimport Language

View File

@@ -2,6 +2,11 @@ from cython.operator cimport dereference as deref
from cython.operator cimport preincrement as inc
from spacy.lexeme cimport Lexeme
from spacy.lexeme cimport norm_of
from spacy.spacy cimport StringHash
cdef class Tokens:
    def __cinit__(self, Language lang):
        self.lang = lang
@@ -38,11 +43,11 @@ cdef class Tokens:
    cpdef dict count_by(self, Field attr):
        counts = {}
        cdef Lexeme_addr t
        cdef Lexeme* word
        cdef StringHash key
        for t in self.vctr[0]:
            word = <Lexeme*>t
            if word.lex not in counts:
                counts[word.lex] = 0
            counts[word.lex] += 1
            key = norm_of(t)
            if key not in counts:
                counts[key] = 0
            counts[key] += 1
        return counts
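For orientation only: after this change, count_by keys its dictionary on the normalized-form hash rather than on word.lex. A hypothetical usage sketch, not part of this diff, with LEX standing in for an assumed Field value:

# Editorial sketch: `lang` is an initialised Language; LEX is an assumed Field
# value (the body above currently ignores the attr argument).
tokens = lang.tokenize(u'The the THE')
counts = tokens.count_by(LEX)
# get_normalized() lower-cases, so all three tokens share one norm hash and
# counts maps that single key to 3.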