* Reforming data model for lexemes

This commit is contained in:
Matthew Honnibal 2014-08-19 02:40:37 +02:00
parent e091b6a241
commit 3379d7a571
5 changed files with 90 additions and 67 deletions

View File

@@ -64,3 +64,33 @@ cpdef Lexeme_addr lookup(unicode string) except 0:
cpdef unicode unhash(StringHash hash_value):
    return EN.unhash(hash_value)
cpdef bint is_oft_upper(size_t lex_id):
    '''Access the `oft_upper' flag of the Lexeme pointed to by lex_id, which
    stores whether the lowered version of the string hashed by `lex' is
    frequently found in all-upper case in a large sample of text. Users are
    free to load different data; by default we use a sample from Wikipedia,
    with a threshold of 0.95, picked to maximize mutual information for POS
    tagging.
    >>> is_oft_upper(lookup(u'abc'))
    True
    >>> is_oft_upper(lookup(u'aBc')) # This must get the same answer
    True
    '''
    return (<Lexeme*>lex_id).dist.flags & (1 << OFT_UPPER)
cpdef bint is_oft_title(size_t lex_id):
    '''Access the `oft_title' flag of the Lexeme pointed to by lex_id, which
    stores whether the lowered version of the string hashed by `lex' is
    frequently found title-cased in a large sample of text. Users are free
    to load different data; by default we use a sample from Wikipedia, with
    a threshold of 0.3, picked to maximize mutual information for POS tagging.
    >>> is_oft_title(lookup(u'marcus'))
    True
    >>> is_oft_title(lookup(u'MARCUS')) # This must get the same value
    True
    '''
    return (<Lexeme*>lex_id).dist.flags & (1 << OFT_TITLE)
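A quick usage sketch of these accessors (assuming lookup, is_oft_upper and is_oft_title are all exposed from this file; the answers are illustrative and depend entirely on the distribution data that has been loaded):

    >>> lex_id = lookup(u'nato')          # the flags key off the lowered form
    >>> is_oft_upper(lex_id)              # e.g. if u'NATO' is usually all-caps
    True
    >>> is_oft_title(lookup(u'london'))   # e.g. if u'London' is usually title-cased
    True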

View File

@@ -9,13 +9,35 @@ ctypedef char Bits8
ctypedef uint64_t Bits64
cdef enum OrthFlag:
    IS_ALPHA
    IS_DIGIT
    IS_PUNCT
    IS_WHITE
    IS_LOWER
    IS_UPPER
    IS_TITLE
    IS_ASCII
cdef enum DistFlag:
    OFT_UPPER
    OFT_TITLE
    DIST_FLAG3
    DIST_FLAG4
    DIST_FLAG5
    DIST_FLAG6
    DIST_FLAG7
    DIST_FLAG8
cdef struct Orthography:
    StringHash last3
    StringHash shape
    StringHash norm
    size_t length
    unsigned char first
    Py_UNICODE first
    Bits8 flags
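The Orthography struct caches hashed views of the word's written form, alongside its length, first character, and the OrthFlag bits. As a rough illustration of the kind of strings the view hashes are computed from (the actual derivations live in Language.new_orth, which is not part of this diff, so the norm and shape rules here are assumptions rather than the implementation):

    >>> string = u'Speaker'
    >>> string[-3:]              # the sort of substring last3 hashes
    u'ker'
    >>> string.lower()           # a plausible source for the norm view (assumption)
    u'speaker'

shape presumably hashes a word-shape string such as u'Xxxxx' (again an assumption).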
@@ -27,15 +49,12 @@ cdef struct Distribution:
cdef struct Lexeme:
    StringHash sic # Hash of the original string
    StringHash lex # Hash of the word, with punctuation and clitics split off
    Distribution* dist # Distribution info, lazy loaded
    StringHash lex # Hash of the word
    Orthography* orth # Extra orthographic views
    #Lexeme* tail # Lexemes are linked lists, to deal with sub-tokens
    Distribution* dist # Distribution info
cdef Lexeme BLANK_WORD = Lexeme(0, 0, NULL, NULL, NULL)
cdef Lexeme BLANK_WORD = Lexeme(0, NULL, NULL)
cdef enum StringAttr:
@@ -49,7 +68,16 @@ cdef enum StringAttr:
cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0
cpdef StringHash lex_of(size_t lex_id) except 0
cpdef StringHash norm_of(size_t lex_id) except 0
cpdef StringHash shape_of(size_t lex_id) except 0
cpdef StringHash last3_of(size_t lex_id) except 0
cpdef StringHash length_of(size_t lex_id)
cpdef size_t length_of(size_t lex_id) except *
cpdef Py_UNICODE first_of(size_t lex_id) except *
cpdef double prob_of(size_t lex_id) except 0
cpdef ClusterID cluster_of(size_t lex_id) except 0
cpdef bint check_orth_flag(size_t lex_id, OrthFlag flag) except *
cpdef bint check_dist_flag(size_t lex_id, DistFlag flag) except *
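check_orth_flag and check_dist_flag treat the Bits8 fields as bitsets indexed by the enum members, so each check is a single mask test. A minimal sketch of the arithmetic in plain Python (writing the implicit enum values 0 and 1 out by hand):

    >>> OFT_UPPER, OFT_TITLE = 0, 1
    >>> flags = 0
    >>> flags |= 1 << OFT_TITLE           # set the title-cased bit
    >>> bool(flags & (1 << OFT_UPPER))    # what check_dist_flag(lex_id, OFT_UPPER) tests
    False
    >>> bool(flags & (1 << OFT_TITLE))    # what check_dist_flag(lex_id, OFT_TITLE) tests
    True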

View File

@@ -13,13 +13,6 @@ from libcpp.vector cimport vector
from spacy.spacy cimport StringHash
# Reiterate the enum, for python
#SIC = StringAttr.sic
#LEX = StringAttr.lex
#NORM = StringAttr.norm
#SHAPE = StringAttr.shape
#LAST3 = StringAttr.last3
cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0:
    if attr == LEX:
@@ -133,37 +126,9 @@ cpdef double prob_of(size_t lex_id):
    return (<Lexeme*>lex_id).dist.prob
cpdef bint is_oft_upper(size_t lex_id):
    '''Access the `oft_upper' flag of the Lexeme pointed to by lex_id, which
    stores whether the lowered version of the string hashed by `lex' is
    frequently found in all-upper case in a large sample of text. Users are
    free to load different data; by default we use a sample from Wikipedia,
    with a threshold of 0.95, picked to maximize mutual information for POS
    tagging.
    >>> is_oft_upper(lookup(u'abc'))
    True
    >>> is_oft_upper(lookup(u'aBc')) # This must get the same answer
    True
    '''
    return False
    #cdef Lexeme* w = <Lexeme*>lex_id
    #return w.orth.last3 if w.orth != NULL else 0
cpdef bint check_orth_flag(size_t lex_id, OrthFlag flag) except *:
    return (<Lexeme*>lex_id).orth.flags & (1 << flag)
    #return (<Lexeme*>lex_id).oft_upper
cpdef bint is_oft_title(size_t lex_id):
    '''Access the `oft_title' flag of the Lexeme pointed to by lex_id, which
    stores whether the lowered version of the string hashed by `lex' is
    frequently found title-cased in a large sample of text. Users are free
    to load different data; by default we use a sample from Wikipedia, with
    a threshold of 0.3, picked to maximize mutual information for POS tagging.
    >>> is_oft_title(lookup(u'marcus'))
    True
    >>> is_oft_title(lookup(u'MARCUS')) # This must get the same value
    True
    '''
    return False
    #return (<Lexeme*>lex_id).oft_title
cpdef bint check_dist_flag(size_t lex_id, DistFlag flag) except *:
    return (<Lexeme*>lex_id).dist.flags & (1 << flag)
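With this hunk the bespoke is_oft_upper/is_oft_title accessors give way to the generic flag checks. A rough sketch of the replacement calls (assuming lookup and the DistFlag members are importable alongside these functions; the answers still depend on the loaded data):

    >>> check_dist_flag(lookup(u'abc'), OFT_UPPER)       # was is_oft_upper(lookup(u'abc'))
    True
    >>> check_dist_flag(lookup(u'marcus'), OFT_TITLE)    # was is_oft_title(lookup(u'marcus'))
    True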

View File

@@ -60,6 +60,7 @@ cdef class Language:
        self.chunks.set_empty_key(0)
        self.vocab.set_empty_key(0)
        self.load_tokenization(util.read_tokenization(name))
        self.load_dist_info(util.load_dist_info(name))
    cdef Tokens tokenize(self, unicode string):
        cdef Lexeme** chunk
@@ -108,7 +109,8 @@ cdef class Language:
        word.lex = hash(string)
        self.bacov[word.lex] = string
        word.orth = self.new_orth(string)
        word.dist = self.new_dist(string)
        word.dist = <Distribution*>calloc(1, sizeof(Distribution))
        self.vocab[word.lex] = <size_t>word
        return word
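new_lexeme interns the string in both directions: the hash becomes the vocab key, and bacov maps the hash back to the text so unhash can recover it. A round-trip sketch using the module-level lookup, lex_of and unhash helpers shown earlier (assuming a word the tokenizer does not split, so its lex hash covers the whole string):

    >>> lex_id = lookup(u'thumbtack')
    >>> unhash(lex_of(lex_id))
    u'thumbtack'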
@@ -135,13 +137,8 @@ cdef class Language:
        self.bacov[orth.last3] = last3
        self.bacov[orth.norm] = norm
        self.bacov[orth.shape] = shape
        return orth
    cdef Distribution* new_dist(self, unicode lex) except NULL:
        dist = <Distribution*>calloc(1, sizeof(Distribution))
        return dist
    cdef unicode unhash(self, StringHash hash_value):
        '''Fetch a string from the reverse index, given its hash value.'''
        return self.bacov[hash_value]
@@ -164,21 +161,18 @@ cdef class Language:
        for chunk, tokens in token_rules:
            self.new_chunk(chunk, tokens)
    def load_clusters(self):
    def load_dist_info(self, dist_info):
        cdef unicode string
        cdef dict word_dist
        cdef Lexeme* w
        data_dir = path.join(path.dirname(__file__), '..', 'data', 'en')
        case_stats = util.load_case_stats(data_dir)
        brown_loc = path.join(data_dir, 'clusters')
        cdef size_t start
        cdef int end
        with util.utf8open(brown_loc) as browns_file:
            for i, line in enumerate(browns_file):
                cluster_str, token_string, freq_str = line.split()
                # Decode as a little-endian string, so that we can do & 15 to get
                # the first 4 bits. See redshift._parse_features.pyx
                cluster = int(cluster_str[::-1], 2)
                upper_pc, title_pc = case_stats.get(token_string.lower(), (0.0, 0.0))
                self.new_lexeme(token_string)
        for string, word_dist in dist_info.items():
            w = self.lookup(string)
            w.dist.prob = word_dist['prob']
            w.dist.cluster = word_dist['cluster']
            for flag in word_dist['flags']:
                w.dist.flags |= lexeme.DIST_FLAGS[flag]
            for tag in word_dist['tagdict']:
                w.dist.tagdict |= lexeme.TAGS[tag]
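load_dist_info consumes one record per word string; a rough sketch of the shape it expects (the key names follow the lookups above, but the concrete flag names, tag names, and numbers are illustrative assumptions, not values from the real distribution_info.json):

    >>> dist_info = {
    ...     u'the': {
    ...         'prob': -3.07,            # log probability (assumed scale)
    ...         'cluster': 22,            # Brown cluster id (assumed)
    ...         'flags': ['OFT_TITLE'],   # keys into lexeme.DIST_FLAGS (assumed names)
    ...         'tagdict': ['DT'],        # keys into lexeme.TAGS (assumed names)
    ...     },
    ... }
    >>> EN.load_dist_info(dist_info)      # assuming EN is the module-level Language instance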
cdef inline bint _is_whitespace(Py_UNICODE c) nogil:

View File

@@ -19,6 +19,12 @@ def load_case_stats(data_dir):
    return case_stats
def load_dist_info(lang):
    with open(path.join(DATA_DIR, lang, 'distribution_info.json')) as file_:
        dist_info = json.load(file_)
    return dist_info
def read_tokenization(lang):
    loc = path.join(DATA_DIR, lang, 'tokenization')
    entries = []