* Working refactor, with updated data model for Lexemes

Matthew Honnibal 2014-08-19 04:21:20 +02:00
parent 3379d7a571
commit 5fddb8d165
5 changed files with 72 additions and 88 deletions

View File

@@ -66,31 +66,3 @@ cpdef unicode unhash(StringHash hash_value):
     return EN.unhash(hash_value)
 
-cpdef bint is_oft_upper(size_t lex_id):
-    '''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which
-    stores whether the lowered version of the string hashed by `lex' is found
-    in all-upper case frequently in a large sample of text. Users are free
-    to load different data, by default we use a sample from Wikipedia, with
-    a threshold of 0.95, picked to maximize mutual information for POS tagging.
-
-    >>> is_oft_upper(lookup(u'abc'))
-    True
-    >>> is_oft_upper(lookup(u'aBc')) # This must get the same answer
-    True
-    '''
-    return (<Lexeme*>lex_id).dist.flags & OFT_UPPER
-
-cpdef bint is_oft_title(size_t lex_id):
-    '''Access the `oft_title' field of the Lexeme pointed to by lex_id, which
-    stores whether the lowered version of the string hashed by `lex' is found
-    title-cased frequently in a large sample of text. Users are free
-    to load different data, by default we use a sample from Wikipedia, with
-    a threshold of 0.3, picked to maximize mutual information for POS tagging.
-
-    >>> is_oft_title(lookup(u'marcus'))
-    True
-    >>> is_oft_title(lookup(u'MARCUS')) # This must get the same value
-    True
-    '''
-    return (<Lexeme*>lex_id).dist.flags & OFT_TITLE

View File

@@ -32,12 +32,9 @@ cdef enum DistFlag:
 cdef struct Orthography:
+    StringHash last3
     StringHash shape
     StringHash norm
-    StringHash last3
-    size_t length
-    Py_UNICODE first
     Bits8 flags
@@ -49,12 +46,17 @@ cdef struct Distribution:
 cdef struct Lexeme:
-    StringHash lex     # Hash of the word
-    Orthography* orth  # Extra orthographic views
-    Distribution* dist # Distribution info
+    char* string
+    size_t length
+    StringHash lex
+    Orthography orth   # Extra orthographic views
+    Distribution dist  # Distribution info
 
-cdef Lexeme BLANK_WORD = Lexeme(0, NULL, NULL)
+cdef Lexeme BLANK_WORD = Lexeme(NULL, 0, 0,
+                                Orthography(0, 0, 0, 0),
+                                Distribution(0.0, 0, 0, 0)
+                               )
 
 cdef enum StringAttr:
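
The struct change above is the core of the refactor: Orthography and Distribution were previously reached through pointers that each needed their own allocation and a NULL check before use; they are now embedded in Lexeme directly, so one calloc covers the whole record. A minimal ctypes sketch of the idea (not spaCy code; the field widths are assumptions and Distribution is omitted for brevity):

    import ctypes

    class Orthography(ctypes.Structure):
        # assumed 64-bit hashes and an 8-bit flag field
        _fields_ = [("last3", ctypes.c_uint64),
                    ("shape", ctypes.c_uint64),
                    ("norm", ctypes.c_uint64),
                    ("flags", ctypes.c_uint8)]

    class OldLexeme(ctypes.Structure):
        # old layout: a separate heap allocation hangs off each word
        _fields_ = [("lex", ctypes.c_uint64),
                    ("orth", ctypes.POINTER(Orthography))]

    class NewLexeme(ctypes.Structure):
        # new layout: the orthographic view lives inline
        _fields_ = [("string", ctypes.c_char_p),
                    ("length", ctypes.c_size_t),
                    ("lex", ctypes.c_uint64),
                    ("orth", Orthography)]

    word = NewLexeme()
    word.orth.norm = 12345   # no NULL check before touching orth
    print(ctypes.sizeof(OldLexeme), ctypes.sizeof(NewLexeme))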
@@ -68,13 +70,11 @@ cdef enum StringAttr:
 cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0
 
 cpdef StringHash lex_of(size_t lex_id) except 0
 cpdef StringHash norm_of(size_t lex_id) except 0
 cpdef StringHash shape_of(size_t lex_id) except 0
 cpdef StringHash last3_of(size_t lex_id) except 0
 cpdef size_t length_of(size_t lex_id) except *
-cpdef Py_UNICODE first_of(size_t lex_id) except *
 cpdef double prob_of(size_t lex_id) except 0
 cpdef ClusterID cluster_of(size_t lex_id) except 0

View File

@@ -72,7 +72,7 @@ cpdef StringHash last3_of(size_t lex_id) except 0:
     return (<Lexeme*>lex_id).orth.last3
 
-cpdef ClusterID cluster_of(size_t lex_id):
+cpdef ClusterID cluster_of(size_t lex_id) except 0:
     '''Access the `cluster' field of the Lexeme pointed to by lex_id, which
     gives an integer representation of the cluster ID of the word,
     which should be understood as a binary address:
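
The docstring's "binary address" phrasing deserves a gloss: Brown-style cluster IDs encode a path through a binary merge tree, so two words belong to the same coarse cluster exactly when their addresses share a prefix. A sketch with invented cluster values:

    def same_coarse_cluster(path_a, path_b, depth):
        # agreement on the first `depth` bits of the address means
        # membership in the same coarse-grained cluster
        return path_a[:depth] == path_b[:depth]

    print(same_coarse_cluster("10110", "10111", 4))  # True: tree siblings
    print(same_coarse_cluster("10110", "01101", 1))  # False: split at root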
@@ -99,21 +99,17 @@ cpdef Py_UNICODE first_of(size_t lex_id):
     >>> unhash(first_of(lex_id))
     u'H'
     '''
-    if (<Lexeme*>lex_id).orth == NULL:
-        return 0
     return (<Lexeme*>lex_id).orth.first
 
-cpdef StringHash length_of(size_t lex_id):
+cpdef size_t length_of(size_t lex_id) except *:
     '''Access the `length' field of the Lexeme pointed to by lex_id, which stores
     the length of the string hashed by lex_of.'''
     cdef Lexeme* word = <Lexeme*>lex_id
-    if (<Lexeme*>lex_id).orth == NULL:
-        return 0
-    return (<Lexeme*>lex_id).orth.length
+    return word.length
 
-cpdef double prob_of(size_t lex_id):
+cpdef double prob_of(size_t lex_id) except 0:
     '''Access the `prob' field of the Lexeme pointed to by lex_id, which stores
     the smoothed unigram log probability of the word, as estimated from a large
     text corpus. By default, probabilities are based on counts from Gigaword,
@@ -126,9 +122,38 @@ cpdef double prob_of(size_t lex_id):
     return (<Lexeme*>lex_id).dist.prob
 
+cpdef bint is_oft_upper(size_t lex_id):
+    '''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which
+    stores whether the lowered version of the string hashed by `lex' is found
+    in all-upper case frequently in a large sample of text. Users are free
+    to load different data, by default we use a sample from Wikipedia, with
+    a threshold of 0.95, picked to maximize mutual information for POS tagging.
+
+    >>> is_oft_upper(lookup(u'abc'))
+    True
+    >>> is_oft_upper(lookup(u'aBc')) # This must get the same answer
+    True
+    '''
+    return (<Lexeme*>lex_id).dist.flags & OFT_UPPER
+
+cpdef bint is_oft_title(size_t lex_id):
+    '''Access the `oft_title' field of the Lexeme pointed to by lex_id, which
+    stores whether the lowered version of the string hashed by `lex' is found
+    title-cased frequently in a large sample of text. Users are free
+    to load different data, by default we use a sample from Wikipedia, with
+    a threshold of 0.3, picked to maximize mutual information for POS tagging.
+
+    >>> is_oft_title(lookup(u'marcus'))
+    True
+    >>> is_oft_title(lookup(u'MARCUS')) # This must get the same value
+    True
+    '''
+    return (<Lexeme*>lex_id).dist.flags & OFT_TITLE
+
-cpdef bint check_orth_flag(size_t lex, OrthFlag flag) except *:
+cpdef bint check_orth_flag(size_t lex_id, OrthFlag flag) except *:
     return (<Lexeme*>lex_id).orth.flags & (1 << flag)
 
-cpdef bint check_dist_flag(size_t lex, DistFlag flag) except *:
+cpdef bint check_dist_flag(size_t lex_id, DistFlag flag) except *:
     return (<Lexeme*>lex_id).dist.flags & (1 << flag)
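
For readers new to the flag scheme: dist.flags packs every boolean distributional property into one integer, so check_dist_flag is a single shift-and-mask. A plain-Python sketch (the bit positions are assumptions, not the real DistFlag enum values):

    OFT_UPPER = 1   # hypothetical enum values
    OFT_TITLE = 2

    def check_dist_flag(flags, flag):
        # mirrors the Cython version: shift the flag index into a mask
        return bool(flags & (1 << flag))

    flags = 0
    flags |= (1 << OFT_TITLE)                  # set while loading dist info
    print(check_dist_flag(flags, OFT_TITLE))   # True
    print(check_dist_flag(flags, OFT_UPPER))   # False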

View File

@@ -21,7 +21,6 @@ ctypedef int ClusterID
 from spacy.lexeme cimport Lexeme
 from spacy.lexeme cimport Distribution
 from spacy.lexeme cimport Orthography
-from spacy._hashing cimport WordTree
 
 cdef class Language:
@@ -37,8 +36,6 @@ cdef class Language:
     cdef Lexeme** new_chunk(self, unicode string, list substrings) except NULL
     cdef Lexeme* new_lexeme(self, unicode lex) except NULL
-    cdef Orthography* new_orth(self, unicode lex) except NULL
-    cdef Distribution* new_dist(self, unicode lex) except NULL
 
     cdef unicode unhash(self, StringHash hashed)

View File

@@ -13,15 +13,19 @@ from spacy.string_tools cimport substr
 from . import util
 from os import path
 
+DIST_FLAGS = {}
+TAGS = {}
+
-def get_normalized(unicode lex, size_t length):
+def get_normalized(unicode lex):
     if lex.isalpha() and lex.islower():
         return lex
     else:
-        return get_word_shape(lex, length)
+        return get_word_shape(lex)
 
-def get_word_shape(unicode lex, length):
+def get_word_shape(unicode lex):
+    cdef size_t length = len(lex)
     shape = ""
     last = ""
     shape_char = ""
@@ -47,7 +51,7 @@ def get_word_shape(unicode lex, length):
     return shape
 
-def set_orth_flags(lex, length):
+def set_orth_flags(lex):
     return 0
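
The body of get_word_shape is elided by these hunks; for orientation, a hedged plain-Python reimplementation of the usual word-shape idea (the exact character classes and run cap are assumptions, not the Cython body):

    def word_shape(lex, max_run=4):
        shape = []
        last = ""
        run = 0
        for c in lex:
            # collapse letters to x/X and digits to d; keep punctuation
            if c.isalpha():
                char_class = "X" if c.isupper() else "x"
            elif c.isdigit():
                char_class = "d"
            else:
                char_class = c
            run = run + 1 if char_class == last else 1
            if run <= max_run:   # truncate long runs of one class
                shape.append(char_class)
            last = char_class
        return "".join(shape)

    print(word_shape(u"Google"))  # 'Xxxxx'
    print(word_shape(u"C3PO-2"))  # 'XdXX-d'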
@@ -60,7 +64,7 @@ cdef class Language:
         self.chunks.set_empty_key(0)
         self.vocab.set_empty_key(0)
         self.load_tokenization(util.read_tokenization(name))
-        self.load_dist_info(util.read_dist_info(name))
+        #self.load_dist_info(util.read_dist_info(name))
 
     cdef Tokens tokenize(self, unicode string):
         cdef Lexeme** chunk
@@ -106,39 +110,25 @@ cdef class Language:
     cdef Lexeme* new_lexeme(self, unicode string) except NULL:
         cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
+        cdef bytes byte_string = string.encode('utf8')
+        word.string = <char*>byte_string
+        word.length = len(byte_string)
+        word.orth.flags = set_orth_flags(string)
+        cdef unicode norm = get_normalized(string)
+        cdef unicode shape = get_word_shape(string)
+        cdef unicode last3 = string[-3:]
         word.lex = hash(string)
+        word.orth.norm = hash(norm)
+        word.orth.shape = hash(shape)
+        word.orth.last3 = hash(last3)
         self.bacov[word.lex] = string
-        word.orth = self.new_orth(string)
-        word.dist = <Distribution*>calloc(1, sizeof(Distribution))
-        self.vocab[word.lex] = <size_t>word
+        self.bacov[word.orth.norm] = norm
+        self.bacov[word.orth.shape] = shape
+        self.bacov[word.orth.last3] = last3
+        self.vocab[hash(string)] = <size_t>word
         return word
 
-    cdef Orthography* new_orth(self, unicode lex) except NULL:
-        cdef unicode last3
-        cdef unicode norm
-        cdef unicode shape
-        cdef int length
-        length = len(lex)
-        orth = <Orthography*>calloc(1, sizeof(Orthography))
-        orth.first = lex[0]
-        orth.length = length
-        orth.flags = set_orth_flags(lex, orth.length)
-        orth.norm = hash(lex)
-        last3 = substr(lex, length - 3, length, length)
-        orth.last3 = hash(last3)
-        norm = get_normalized(lex, length)
-        orth.norm = hash(norm)
-        shape = get_word_shape(lex, length)
-        orth.shape = hash(shape)
-        self.bacov[orth.last3] = last3
-        self.bacov[orth.norm] = norm
-        self.bacov[orth.shape] = shape
-        return orth
-
     cdef unicode unhash(self, StringHash hash_value):
         '''Fetch a string from the reverse index, given its hash value.'''
         return self.bacov[hash_value]
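
The invariant new_lexeme maintains is that every hash it hands out is registered in self.bacov (the "backwards vocab"), so unhash can always round-trip. A dict-based stand-in for the two tables (the real code uses dense_hash_map; intern and the normalization rule here are hypothetical simplifications):

    bacov = {}   # StringHash -> unicode, the reverse index behind unhash()
    vocab = {}   # StringHash -> lexeme record

    def intern(string):
        key = hash(string)
        if key not in vocab:
            norm = string.lower()             # stand-in for get_normalized
            record = {"lex": key, "norm": hash(norm), "length": len(string)}
            vocab[key] = record
            bacov[key] = string               # register every hash we store
            bacov[record["norm"]] = norm
        return vocab[key]

    word = intern(u"Hello")
    print(bacov[word["lex"]])  # u'Hello', i.e. what unhash() would return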
@@ -167,12 +157,12 @@ cdef class Language:
         cdef Lexeme* w
         for string, word_dist in dist_info.items():
             w = self.lookup(string)
-            w.prob = word_dist.prob
-            w.cluster = word_dist.cluster
+            w.dist.prob = word_dist.prob
+            w.dist.cluster = word_dist.cluster
             for flag in word_dist.flags:
-                w.flags |= lexeme.DIST_FLAGS[flag]
+                w.dist.flags |= DIST_FLAGS[flag]
             for tag in word_dist.tagdict:
-                w.tagdict |= lexeme.TAGS[tag]
+                w.dist.tagdict |= TAGS[tag]
 
 cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
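
One last note on load_dist_info: the distributional fields now live behind w.dist, and string-valued flags from the data file are OR-ed into the packed flags field. A sketch with an invented record and flag table:

    DIST_FLAGS = {"oft_upper": 1 << 1, "oft_title": 1 << 2}  # assumed bits

    class Dist(object):
        def __init__(self):
            self.prob = 0.0
            self.cluster = 0
            self.flags = 0

    dist = Dist()
    record = {"prob": -8.2, "cluster": 0b10110, "flags": ["oft_title"]}
    dist.prob = record["prob"]
    dist.cluster = record["cluster"]
    for flag in record["flags"]:
        dist.flags |= DIST_FLAGS[flag]                 # pack booleans into bits
    print(bool(dist.flags & DIST_FLAGS["oft_title"]))  # True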