* Temporary commit: refactor in progress. Compiles, but the lexical features still need to be hooked up.

This commit is contained in:
Matthew Honnibal 2015-01-14 00:03:48 +11:00
parent 46da3d74d2
commit 0930892fc1
9 changed files with 150 additions and 196 deletions

View File

@ -12,7 +12,10 @@ from .attrs import get_flags
def get_lex_props(string): def get_lex_props(string):
return {'flags': get_flags(string), 'dense': 1} return {'flags': get_flags(string), 'length': len(string),
'sic': string, 'norm1': string, 'norm2': string, 'shape': string,
'prefix': string[0], 'suffix': string[-3:], 'cluster': 0, 'prob': 0,
'sentiment': 0}
LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data') LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data')
@ -45,7 +48,7 @@ class English(object):
""" """
def __init__(self, data_dir=LOCAL_DATA_DIR): def __init__(self, data_dir=LOCAL_DATA_DIR):
self._data_dir = data_dir self._data_dir = data_dir
self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab'), self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab') if data_dir else None,
get_lex_props=get_lex_props) get_lex_props=get_lex_props)
tag_names = list(POS_TAGS.keys()) tag_names = list(POS_TAGS.keys())
tag_names.sort() tag_names.sort()

View File

@ -283,12 +283,12 @@ cdef class EnPosTagger:
cdef int lemmatize(self, const univ_tag_t pos, const LexemeC* lex) except -1: cdef int lemmatize(self, const univ_tag_t pos, const LexemeC* lex) except -1:
if self.lemmatizer is None: if self.lemmatizer is None:
return lex.sic return lex.sic
cdef bytes py_string = self.strings[lex.sic] cdef unicode py_string = self.strings[lex.sic]
if pos != NOUN and pos != VERB and pos != ADJ: if pos != NOUN and pos != VERB and pos != ADJ:
return lex.sic return lex.sic
cdef set lemma_strings cdef set lemma_strings
cdef unicode lemma_string cdef unicode lemma_string
lemma_strings = self.lemmatizer(py_string.decode('utf8'), pos) lemma_strings = self.lemmatizer(py_string, pos)
lemma_string = sorted(lemma_strings)[0] lemma_string = sorted(lemma_strings)[0]
lemma = self.strings.intern(lemma_string.encode('utf8'), len(lemma_string)).i lemma = self.strings.intern(lemma_string.encode('utf8'), len(lemma_string)).i
return lemma return lemma

View File

@ -7,10 +7,8 @@ from .strings cimport StringStore
cdef LexemeC EMPTY_LEXEME cdef LexemeC EMPTY_LEXEME
cdef LexemeC init(id_t i, unicode string, hash_t hashed, StringStore store, cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore strings) except -1
dict props) except *
cdef class Lexeme: cdef class Lexeme:
cdef const float* vec cdef const float* vec

View File

@ -5,27 +5,27 @@ from murmurhash.mrmr cimport hash64
from libc.string cimport memset from libc.string cimport memset
from .orth cimport word_shape from .orth cimport word_shape
from .typedefs cimport attr_t
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC)) memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
cdef LexemeC init(id_t i, unicode string, hash_t hashed, cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store) except -1:
StringStore string_store, dict props) except *:
cdef LexemeC lex
lex.id = i
lex.length = len(string)
lex.sic = string_store[string]
lex.cluster = props.get('cluster', 0)
lex.prob = props.get('prob', 0)
lex.prefix = string_store[string[:1]] lex.length = props['length']
lex.suffix = string_store[string[-3:]] lex.sic = string_store[props['sic']]
lex.shape = string_store[word_shape(string)] lex.norm1 = string_store[props['norm1']]
lex.norm2 = string_store[props['norm2']]
lex.flags = props.get('flags', 0) lex.shape = string_store[props['shape']]
return lex lex.prefix = string_store[props['prefix']]
lex.suffix = string_store[props['suffix']]
lex.cluster = props['cluster']
lex.prob = props['prob']
lex.sentiment = props['sentiment']
lex.flags = props['flags']
cdef class Lexeme: cdef class Lexeme:

View File

@ -67,7 +67,7 @@ cdef class StringStore:
if string_or_id < 1 or string_or_id >= self.size: if string_or_id < 1 or string_or_id >= self.size:
raise IndexError(string_or_id) raise IndexError(string_or_id)
utf8str = &self.strings[<int>string_or_id] utf8str = &self.strings[<int>string_or_id]
return utf8str.chars[:utf8str.length] return utf8str.chars[:utf8str.length].decode('utf8')
elif isinstance(string_or_id, bytes): elif isinstance(string_or_id, bytes):
utf8str = self.intern(<char*>string_or_id, len(string_or_id)) utf8str = self.intern(<char*>string_or_id, len(string_or_id))
return utf8str.i return utf8str.i

View File

@ -42,32 +42,5 @@ cdef class Tokens:
cdef class Token: cdef class Token:
cdef cvarray vec cdef readonly Tokens _seq
cdef readonly int i
cdef readonly flags_t flags
cdef readonly attr_t id
cdef readonly attr_t sic
cdef readonly attr_t dense
cdef readonly attr_t shape
cdef readonly attr_t prefix
cdef readonly attr_t suffix
cdef readonly attr_t length
cdef readonly attr_t cluster
cdef readonly attr_t pos_type
cdef readonly float prob
cdef readonly float sentiment
cdef readonly Morphology morph
cdef readonly univ_tag_t pos
cdef readonly int fine_pos
cdef readonly int idx
cdef readonly int lemma
cdef readonly int sense
cdef readonly int dep_tag
cdef readonly int head_offset
cdef readonly uint32_t l_kids
cdef readonly uint32_t r_kids

View File

@ -85,7 +85,7 @@ cdef class Tokens:
token (Token): token (Token):
""" """
bounds_check(i, self.length, PADDING) bounds_check(i, self.length, PADDING)
return cinit_token(&self.data[i]) return Token(self, i)
def __iter__(self): def __iter__(self):
"""Iterate over the tokens. """Iterate over the tokens.
@ -174,38 +174,26 @@ cdef class Tokens:
self.data[i].lex = &EMPTY_LEXEME self.data[i].lex = &EMPTY_LEXEME
cdef Token cinit_token(const TokenC* c_tok): @cython.freelist(64)
cdef Token py_tok = Token.__new__(Token)
py_tok.morph = c_tok.morph
py_tok.pos = c_tok.pos
py_tok.fine_pos = c_tok.fine_pos
py_tok.idx = c_tok.idx
py_tok.lemma = c_tok.lemma
py_tok.sense = c_tok.sense
py_tok.dep_tag = c_tok.dep_tag
py_tok.head_offset = c_tok.head
py_tok.l_kids = c_tok.l_kids
py_tok.r_kids = c_tok.r_kids
return py_tok
cdef class Token: cdef class Token:
"""An individual token. """An individual token.
"""
def __init__(self):
pass
#self._seq = tokens
#self.i = i
#def __unicode__(self): Internally, the Token is a tuple (i, tokens) --- it delegates to the Tokens
# cdef const TokenC* t = &self._seq.data[self.i] object.
# cdef int end_idx = t.idx + t.lex.length """
# if self.i + 1 == self._seq.length: def __init__(self, Tokens tokens, int i):
# return self.string self._seq = tokens
# if end_idx == t[1].idx: self.i = i
# return self.string
# else: def __unicode__(self):
# return self.string + ' ' cdef const TokenC* t = &self._seq.data[self.i]
cdef int end_idx = t.idx + t.lex.length
if self.i + 1 == self._seq.length:
return self.string
if end_idx == t[1].idx:
return self.string
else:
return self.string + ' '
def __len__(self): def __len__(self):
"""The number of unicode code-points in the original string. """The number of unicode code-points in the original string.
@ -213,87 +201,87 @@ cdef class Token:
Returns: Returns:
length (int): length (int):
""" """
return self.length return self._seq.data[self.i].lex.length
#property idx: property idx:
# """The index into the original string at which the token starts. """The index into the original string at which the token starts.
# The following is supposed to always be true: The following is supposed to always be true:
#
# >>> original_string[token.idx:token.idx + len(token)] == token.string >>> original_string[token.idx:token.idx + len(token)] == token.string
# """ """
# def __get__(self): def __get__(self):
# return self._seq.data[self.i].idx return self._seq.data[self.i].idx
#property cluster: property cluster:
# """The Brown cluster ID of the word: en.wikipedia.org/wiki/Brown_clustering """The Brown cluster ID of the word: en.wikipedia.org/wiki/Brown_clustering
#
# Similar words have better-than-chance likelihood of having similar cluster Similar words have better-than-chance likelihood of having similar cluster
# IDs, although the clustering is quite noisy. Cluster IDs make good features, IDs, although the clustering is quite noisy. Cluster IDs make good features,
# and help to make models slightly more robust to domain variation. and help to make models slightly more robust to domain variation.
# A common trick is to use only the first N bits of a cluster ID in a feature, A common trick is to use only the first N bits of a cluster ID in a feature,
# as the more general part of the hierarchical clustering is often more accurate as the more general part of the hierarchical clustering is often more accurate
# than the lower categories. than the lower categories.
# To assist in this, I encode the cluster IDs little-endian, to allow a simple To assist in this, I encode the cluster IDs little-endian, to allow a simple
# bit-mask: bit-mask:
# >>> six_bits = cluster & (2**6 - 1) >>> six_bits = cluster & (2**6 - 1)
# """ """
# def __get__(self): def __get__(self):
# return self._seq.data[self.i].lex.cluster return self._seq.data[self.i].lex.cluster
#property string: property string:
# """The unicode string of the word, with no whitespace padding.""" """The unicode string of the word, with no whitespace padding."""
# def __get__(self): def __get__(self):
# cdef const TokenC* t = &self._seq.data[self.i] cdef const TokenC* t = &self._seq.data[self.i]
# if t.lex.sic == 0: if t.lex.sic == 0:
# return '' return ''
# cdef bytes utf8string = self._seq.vocab.strings[t.lex.sic] cdef unicode py_ustr = self._seq.vocab.strings[t.lex.sic]
# return utf8string.decode('utf8') return py_ustr
#property lemma: property lemma:
# """The unicode string of the word's lemma. If no part-of-speech tag is """The unicode string of the word's lemma. If no part-of-speech tag is
# assigned, the most common part-of-speech tag of the word is used. assigned, the most common part-of-speech tag of the word is used.
# """ """
# def __get__(self): def __get__(self):
# cdef const TokenC* t = &self._seq.data[self.i] cdef const TokenC* t = &self._seq.data[self.i]
# if t.lemma == 0: if t.lemma == 0:
# return self.string return self.string
# cdef bytes utf8string = self._seq.vocab.strings[t.lemma] cdef unicode py_ustr = self._seq.vocab.strings[t.lemma]
# return utf8string.decode('utf8') return py_ustr
#property dep_tag: property dep_tag:
# """The ID integer of the word's dependency label. If no parse has been """The ID integer of the word's dependency label. If no parse has been
# assigned, defaults to 0. assigned, defaults to 0.
# """ """
# def __get__(self): def __get__(self):
# return self._seq.data[self.i].dep_tag return self._seq.data[self.i].dep_tag
#property pos: property pos:
# """The ID integer of the word's part-of-speech tag, from the 13-tag """The ID integer of the word's part-of-speech tag, from the 13-tag
# Google Universal Tag Set. Constants for this tag set are available in Google Universal Tag Set. Constants for this tag set are available in
# spacy.typedefs. spacy.typedefs.
# """ """
# def __get__(self): def __get__(self):
# return self._seq.data[self.i].pos return self._seq.data[self.i].pos
#property fine_pos: property fine_pos:
# """The ID integer of the word's fine-grained part-of-speech tag, as assigned """The ID integer of the word's fine-grained part-of-speech tag, as assigned
# by the tagger model. Fine-grained tags include morphological information, by the tagger model. Fine-grained tags include morphological information,
# and other distinctions, and allow a more accurate tagger to be trained. and other distinctions, and allow a more accurate tagger to be trained.
# """ """
# def __get__(self): def __get__(self):
# return self._seq.data[self.i].fine_pos return self._seq.data[self.i].fine_pos
#property sic: property sic:
# def __get__(self): def __get__(self):
# return self._seq.data[self.i].lex.sic return self._seq.vocab.strings[self._seq.data[self.i].lex.sic]
#property head: property head:
# """The token predicted by the parser to be the head of the current token.""" """The token predicted by the parser to be the head of the current token."""
# def __get__(self): def __get__(self):
# cdef const TokenC* t = &self._seq.data[self.i] cdef const TokenC* t = &self._seq.data[self.i]
# return Token(self._seq, self.i + t.head) return Token(self._seq, self.i + t.head)

View File

@ -24,12 +24,13 @@ cdef struct _Cached:
cdef class Vocab: cdef class Vocab:
cpdef public get_lex_props cpdef public lexeme_props_getter
cdef Pool mem cdef Pool mem
cpdef readonly StringStore strings cpdef readonly StringStore strings
cdef vector[LexemeC*] lexemes cdef vector[const LexemeC*] lexemes
cdef const LexemeC* get(self, Pool mem, UniStr* s) except NULL cdef const LexemeC* get(self, Pool mem, UniStr* s) except NULL
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
cdef PreshMap _map cdef PreshMap _map

View File

@ -5,7 +5,7 @@ from os import path
import codecs import codecs
from .lexeme cimport EMPTY_LEXEME from .lexeme cimport EMPTY_LEXEME
from .lexeme cimport init as lexeme_init from .lexeme cimport set_lex_struct_props
from .lexeme cimport Lexeme_cinit from .lexeme cimport Lexeme_cinit
from .strings cimport slice_unicode from .strings cimport slice_unicode
from .strings cimport hash_string from .strings cimport hash_string
@ -21,24 +21,6 @@ memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
EMPTY_LEXEME.vec = EMPTY_VEC EMPTY_LEXEME.vec = EMPTY_VEC
cdef LexemeC init_lexeme(id_t i, unicode string, hash_t hashed,
StringStore string_store, dict props) except *:
cdef LexemeC lex
lex.id = i
lex.length = len(string)
lex.sic = string_store[string]
lex.cluster = props.get('cluster', 0)
lex.prob = props.get('prob', 0)
lex.prefix = string_store[string[:1]]
lex.suffix = string_store[string[-3:]]
lex.shape = string_store[word_shape(string)]
lex.flags = props.get('flags', 0)
return lex
cdef class Vocab: cdef class Vocab:
'''A map container for a language's LexemeC structs. '''A map container for a language's LexemeC structs.
''' '''
@ -47,7 +29,7 @@ cdef class Vocab:
self._map = PreshMap(2 ** 20) self._map = PreshMap(2 ** 20)
self.strings = StringStore() self.strings = StringStore()
self.lexemes.push_back(&EMPTY_LEXEME) self.lexemes.push_back(&EMPTY_LEXEME)
self.get_lex_props = get_lex_props self.lexeme_props_getter = get_lex_props
if data_dir is not None: if data_dir is not None:
if not path.exists(data_dir): if not path.exists(data_dir):
@ -63,32 +45,36 @@ cdef class Vocab:
"""The current number of lexemes stored.""" """The current number of lexemes stored."""
return self.lexemes.size() return self.lexemes.size()
cdef const LexemeC* get(self, Pool mem, UniStr* string) except NULL: cdef const LexemeC* get(self, Pool mem, UniStr* c_str) except NULL:
'''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
if necessary, using memory acquired from the given pool. If the pool if necessary, using memory acquired from the given pool. If the pool
is the lexicon's own memory, the lexeme is saved in the lexicon.''' is the lexicon's own memory, the lexeme is saved in the lexicon.'''
cdef LexemeC* lex cdef LexemeC* lex
lex = <LexemeC*>self._map.get(string.key) lex = <LexemeC*>self._map.get(c_str.key)
if lex != NULL: if lex != NULL:
return lex return lex
if string.n < 3: if c_str.n < 3:
mem = self.mem mem = self.mem
cdef unicode py_string = string.chars[:string.n] cdef unicode py_str = c_str.chars[:c_str.n]
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1) lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
lex[0] = init_lexeme(self.lexemes.size(), py_string, string.key, self.strings, props = self.lexeme_props_getter(py_str)
self.get_lex_props(py_string)) set_lex_struct_props(lex, props, self.strings)
if mem is self.mem: if mem is self.mem:
self._map.set(string.key, lex) lex.id = self.lexemes.size()
while self.lexemes.size() < (lex.id + 1): self._add_lex_to_vocab(c_str.key, lex)
self.lexemes.push_back(&EMPTY_LEXEME)
self.lexemes[lex.id] = lex
else: else:
lex[0].id = 1 lex.id = 1
return lex return lex
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
self._map.set(key, <void*>lex)
while self.lexemes.size() < (lex.id + 1):
self.lexemes.push_back(&EMPTY_LEXEME)
self.lexemes[lex.id] = lex
def __getitem__(self, id_or_string): def __getitem__(self, id_or_string):
'''Retrieve a lexeme, given an int ID or a unicode string. If a previously '''Retrieve a lexeme, given an int ID or a unicode string. If a previously
unseen unicode string is given, a new LexemeC is created and stored. unseen unicode string is given, a new lexeme is created and stored.
Args: Args:
id_or_string (int or unicode): The integer ID of a word, or its unicode id_or_string (int or unicode): The integer ID of a word, or its unicode
@ -100,24 +86,28 @@ cdef class Vocab:
lexeme (Lexeme): An instance of the Lexeme Python class, with data lexeme (Lexeme): An instance of the Lexeme Python class, with data
copied on instantiation. copied on instantiation.
''' '''
cdef UniStr string cdef UniStr c_str
cdef const LexemeC* lexeme cdef const LexemeC* lexeme
if type(id_or_string) == int: if type(id_or_string) == int:
if id_or_string >= self.lexemes.size(): if id_or_string >= self.lexemes.size():
raise IndexError raise IndexError
lexeme = self.lexemes.at(id_or_string) lexeme = self.lexemes.at(id_or_string)
else: else:
slice_unicode(&string, id_or_string, 0, len(id_or_string)) slice_unicode(&c_str, id_or_string, 0, len(id_or_string))
lexeme = self.get(self.mem, &string) lexeme = self.get(self.mem, &c_str)
return Lexeme_cinit(lexeme, self.strings) return Lexeme_cinit(lexeme, self.strings)
def __setitem__(self, unicode uni_string, dict props): def __setitem__(self, unicode py_str, dict props):
cdef UniStr s cdef UniStr c_str
slice_unicode(&s, uni_string, 0, len(uni_string)) slice_unicode(&c_str, py_str, 0, len(py_str))
# Cast through the const here, since we're allowed to change our own cdef LexemeC* lex
# LexemeCs. lex = <LexemeC*>self._map.get(c_str.key)
lex = <LexemeC*><void*>self.get(self.mem, &s) if lex == NULL:
lex[0] = lexeme_init(lex.id, s.chars[:s.n], s.key, self.strings, props) lex = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
lex.id = self.lexemes.size()
self._add_lex_to_vocab(c_str.key, lex)
set_lex_struct_props(lex, props, self.strings)
assert lex.sic < 1000000
def dump(self, loc): def dump(self, loc):
if path.exists(loc): if path.exists(loc):
@ -154,6 +144,7 @@ cdef class Vocab:
if st != 1: if st != 1:
break break
lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1) lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
lexeme.vec = EMPTY_VEC
st = fread(lexeme, sizeof(LexemeC), 1, fp) st = fread(lexeme, sizeof(LexemeC), 1, fp)
if st != 1: if st != 1:
break break