* Tmp. Working on refactor. Compiles; lexical features still need to be hooked up.

Matthew Honnibal 2015-01-14 00:03:48 +11:00
parent 46da3d74d2
commit 0930892fc1
9 changed files with 150 additions and 196 deletions

View File

@@ -12,7 +12,10 @@ from .attrs import get_flags
def get_lex_props(string):
return {'flags': get_flags(string), 'dense': 1}
return {'flags': get_flags(string), 'length': len(string),
'sic': string, 'norm1': string, 'norm2': string, 'shape': string,
'prefix': string[0], 'suffix': string[-3:], 'cluster': 0, 'prob': 0,
'sentiment': 0}
LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data')
@@ -45,7 +48,7 @@ class English(object):
"""
def __init__(self, data_dir=LOCAL_DATA_DIR):
self._data_dir = data_dir
self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab'),
self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab') if data_dir else None,
get_lex_props=get_lex_props)
tag_names = list(POS_TAGS.keys())
tag_names.sort()
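
For reference, a sketch of the dict the new get_lex_props contract produces, with illustrative values for a hypothetical input ('shape' and the norms are still placeholder copies of the string at this stage):

    # Illustrative only: get_lex_props(u'Hello') under the code above, where
    # 'prefix' is string[0] and 'suffix' is string[-3:].
    {'flags': get_flags(u'Hello'), 'length': 5,
     'sic': u'Hello', 'norm1': u'Hello', 'norm2': u'Hello', 'shape': u'Hello',
     'prefix': u'H', 'suffix': u'llo', 'cluster': 0, 'prob': 0, 'sentiment': 0}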

View File

@@ -283,12 +283,12 @@ cdef class EnPosTagger:
cdef int lemmatize(self, const univ_tag_t pos, const LexemeC* lex) except -1:
if self.lemmatizer is None:
return lex.sic
cdef bytes py_string = self.strings[lex.sic]
cdef unicode py_string = self.strings[lex.sic]
if pos != NOUN and pos != VERB and pos != ADJ:
return lex.sic
cdef set lemma_strings
cdef unicode lemma_string
lemma_strings = self.lemmatizer(py_string.decode('utf8'), pos)
lemma_strings = self.lemmatizer(py_string, pos)
lemma_string = sorted(lemma_strings)[0]
lemma = self.strings.intern(lemma_string.encode('utf8'), len(lemma_string)).i
return lemma
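
The tagger now receives unicode from the string store and passes it straight to the lemmatizer, dropping the decode step. A plain-Python sketch of the selection logic, with pick_lemma as a hypothetical helper:

    # Sketch: the lemmatizer returns a set of candidate lemma strings; taking
    # the alphabetically first candidate keeps the choice deterministic.
    def pick_lemma(lemmatizer, py_string, pos):
        candidates = lemmatizer(py_string, pos)  # e.g. {u'good'} for (u'better', ADJ)
        return sorted(candidates)[0]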

View File

@@ -7,9 +7,7 @@ from .strings cimport StringStore
cdef LexemeC EMPTY_LEXEME
cdef LexemeC init(id_t i, unicode string, hash_t hashed, StringStore store,
dict props) except *
cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore strings) except -1
cdef class Lexeme:
cdef const float* vec

View File

@@ -5,27 +5,27 @@ from murmurhash.mrmr cimport hash64
from libc.string cimport memset
from .orth cimport word_shape
from .typedefs cimport attr_t
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
cdef LexemeC init(id_t i, unicode string, hash_t hashed,
StringStore string_store, dict props) except *:
cdef LexemeC lex
lex.id = i
lex.length = len(string)
lex.sic = string_store[string]
cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store) except -1:
lex.cluster = props.get('cluster', 0)
lex.prob = props.get('prob', 0)
lex.length = props['length']
lex.sic = string_store[props['sic']]
lex.norm1 = string_store[props['norm1']]
lex.norm2 = string_store[props['norm2']]
lex.shape = string_store[props['shape']]
lex.prefix = string_store[props['prefix']]
lex.suffix = string_store[props['suffix']]
lex.prefix = string_store[string[:1]]
lex.suffix = string_store[string[-3:]]
lex.shape = string_store[word_shape(string)]
lex.cluster = props['cluster']
lex.prob = props['prob']
lex.sentiment = props['sentiment']
lex.flags = props.get('flags', 0)
return lex
lex.flags = props['flags']
cdef class Lexeme:

View File

@@ -67,7 +67,7 @@ cdef class StringStore:
if string_or_id < 1 or string_or_id >= self.size:
raise IndexError(string_or_id)
utf8str = &self.strings[<int>string_or_id]
return utf8str.chars[:utf8str.length]
return utf8str.chars[:utf8str.length].decode('utf8')
elif isinstance(string_or_id, bytes):
utf8str = self.intern(<char*>string_or_id, len(string_or_id))
return utf8str.i
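
In other words, the store now decodes on ID lookup, while byte strings are still interned to IDs. A hypothetical round-trip, assuming an empty store:

    # Usage sketch of the round-trip implied by the code above.
    store = StringStore()
    i = store[b'hello']           # bytes are interned; the integer ID comes back
    assert store[i] == u'hello'   # an ID now yields decoded unicode, not bytes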

View File

@@ -42,32 +42,5 @@ cdef class Tokens:
cdef class Token:
cdef cvarray vec
cdef readonly flags_t flags
cdef readonly attr_t id
cdef readonly attr_t sic
cdef readonly attr_t dense
cdef readonly attr_t shape
cdef readonly attr_t prefix
cdef readonly attr_t suffix
cdef readonly attr_t length
cdef readonly attr_t cluster
cdef readonly attr_t pos_type
cdef readonly float prob
cdef readonly float sentiment
cdef readonly Morphology morph
cdef readonly univ_tag_t pos
cdef readonly int fine_pos
cdef readonly int idx
cdef readonly int lemma
cdef readonly int sense
cdef readonly int dep_tag
cdef readonly int head_offset
cdef readonly uint32_t l_kids
cdef readonly uint32_t r_kids
cdef readonly Tokens _seq
cdef readonly int i

View File

@@ -85,7 +85,7 @@ cdef class Tokens:
token (Token):
"""
bounds_check(i, self.length, PADDING)
return cinit_token(&self.data[i])
return Token(self, i)
def __iter__(self):
"""Iterate over the tokens.
@@ -174,38 +174,26 @@ cdef class Tokens:
self.data[i].lex = &EMPTY_LEXEME
cdef Token cinit_token(const TokenC* c_tok):
cdef Token py_tok = Token.__new__(Token)
py_tok.morph = c_tok.morph
py_tok.pos = c_tok.pos
py_tok.fine_pos = c_tok.fine_pos
py_tok.idx = c_tok.idx
py_tok.lemma = c_tok.lemma
py_tok.sense = c_tok.sense
py_tok.dep_tag = c_tok.dep_tag
py_tok.head_offset = c_tok.head
py_tok.l_kids = c_tok.l_kids
py_tok.r_kids = c_tok.r_kids
return py_tok
@cython.freelist(64)
cdef class Token:
"""An individual token.
"""
def __init__(self):
pass
#self._seq = tokens
#self.i = i
#def __unicode__(self):
# cdef const TokenC* t = &self._seq.data[self.i]
# cdef int end_idx = t.idx + t.lex.length
# if self.i + 1 == self._seq.length:
# return self.string
# if end_idx == t[1].idx:
# return self.string
# else:
# return self.string + ' '
Internally, the Token is a tuple (i, tokens) --- it delegates to the Tokens
object.
"""
def __init__(self, Tokens tokens, int i):
self._seq = tokens
self.i = i
def __unicode__(self):
cdef const TokenC* t = &self._seq.data[self.i]
cdef int end_idx = t.idx + t.lex.length
if self.i + 1 == self._seq.length:
return self.string
if end_idx == t[1].idx:
return self.string
else:
return self.string + ' '
def __len__(self):
"""The number of unicode code-points in the original string.
@@ -213,87 +201,87 @@ cdef class Token:
Returns:
length (int):
"""
return self.length
return self._seq.data[self.i].lex.length
#property idx:
# """The index into the original string at which the token starts.
property idx:
"""The index into the original string at which the token starts.
# The following is supposed to always be true:
#
# >>> original_string[token.idx:token.idx + len(token)] == token.string
# """
# def __get__(self):
# return self._seq.data[self.i].idx
The following is supposed to always be true:
#property cluster:
# """The Brown cluster ID of the word: en.wikipedia.org/wiki/Brown_clustering
#
# Similar words have better-than-chance likelihood of having similar cluster
# IDs, although the clustering is quite noisy. Cluster IDs make good features,
# and help to make models slightly more robust to domain variation.
>>> original_string[token.idx:token.idx + len(token)] == token.string
"""
def __get__(self):
return self._seq.data[self.i].idx
# A common trick is to use only the first N bits of a cluster ID in a feature,
# as the more general part of the hierarchical clustering is often more accurate
# than the lower categories.
property cluster:
"""The Brown cluster ID of the word: en.wikipedia.org/wiki/Brown_clustering
# To assist in this, I encode the cluster IDs little-endian, to allow a simple
# bit-mask:
Similar words have better-than-chance likelihood of having similar cluster
IDs, although the clustering is quite noisy. Cluster IDs make good features,
and help to make models slightly more robust to domain variation.
# >>> six_bits = cluster & (2**6 - 1)
# """
# def __get__(self):
# return self._seq.data[self.i].lex.cluster
A common trick is to use only the first N bits of a cluster ID in a feature,
as the more general part of the hierarchical clustering is often more accurate
than the lower categories.
#property string:
# """The unicode string of the word, with no whitespace padding."""
# def __get__(self):
# cdef const TokenC* t = &self._seq.data[self.i]
# if t.lex.sic == 0:
# return ''
# cdef bytes utf8string = self._seq.vocab.strings[t.lex.sic]
# return utf8string.decode('utf8')
To assist in this, I encode the cluster IDs little-endian, to allow a simple
bit-mask:
#property lemma:
# """The unicode string of the word's lemma. If no part-of-speech tag is
# assigned, the most common part-of-speech tag of the word is used.
# """
# def __get__(self):
# cdef const TokenC* t = &self._seq.data[self.i]
# if t.lemma == 0:
# return self.string
# cdef bytes utf8string = self._seq.vocab.strings[t.lemma]
# return utf8string.decode('utf8')
>>> six_bits = cluster & (2**6 - 1)
"""
def __get__(self):
return self._seq.data[self.i].lex.cluster
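
A short sketch of the bit-mask trick the docstring describes, with a made-up cluster value:

    # Little-endian encoding puts the most general part of the hierarchy in
    # the low bits, so a plain mask keeps a coarser, more robust feature.
    cluster = 0b10110110               # hypothetical Brown cluster ID
    six_bits = cluster & (2**6 - 1)    # the docstring's example
    four_bits = cluster & (2**4 - 1)   # even more general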
#property dep_tag:
# """The ID integer of the word's dependency label. If no parse has been
# assigned, defaults to 0.
# """
# def __get__(self):
# return self._seq.data[self.i].dep_tag
property string:
"""The unicode string of the word, with no whitespace padding."""
def __get__(self):
cdef const TokenC* t = &self._seq.data[self.i]
if t.lex.sic == 0:
return ''
cdef unicode py_ustr = self._seq.vocab.strings[t.lex.sic]
return py_ustr
#property pos:
# """The ID integer of the word's part-of-speech tag, from the 13-tag
# Google Universal Tag Set. Constants for this tag set are available in
# spacy.typedefs.
# """
# def __get__(self):
# return self._seq.data[self.i].pos
property lemma:
"""The unicode string of the word's lemma. If no part-of-speech tag is
assigned, the most common part-of-speech tag of the word is used.
"""
def __get__(self):
cdef const TokenC* t = &self._seq.data[self.i]
if t.lemma == 0:
return self.string
cdef unicode py_ustr = self._seq.vocab.strings[t.lemma]
return py_ustr
#property fine_pos:
# """The ID integer of the word's fine-grained part-of-speech tag, as assigned
# by the tagger model. Fine-grained tags include morphological information,
# and other distinctions, and allow a more accurate tagger to be trained.
# """
property dep_tag:
"""The ID integer of the word's dependency label. If no parse has been
assigned, defaults to 0.
"""
def __get__(self):
return self._seq.data[self.i].dep_tag
# def __get__(self):
# return self._seq.data[self.i].fine_pos
property pos:
"""The ID integer of the word's part-of-speech tag, from the 13-tag
Google Universal Tag Set. Constants for this tag set are available in
spacy.typedefs.
"""
def __get__(self):
return self._seq.data[self.i].pos
#property sic:
# def __get__(self):
# return self._seq.data[self.i].lex.sic
property fine_pos:
"""The ID integer of the word's fine-grained part-of-speech tag, as assigned
by the tagger model. Fine-grained tags include morphological information,
and other distinctions, and allow a more accurate tagger to be trained.
"""
#property head:
# """The token predicted by the parser to be the head of the current token."""
# def __get__(self):
# cdef const TokenC* t = &self._seq.data[self.i]
# return Token(self._seq, self.i + t.head)
def __get__(self):
return self._seq.data[self.i].fine_pos
property sic:
def __get__(self):
return self._seq.vocab.strings[self._seq.data[self.i].lex.sic]
property head:
"""The token predicted by the parser to be the head of the current token."""
def __get__(self):
cdef const TokenC* t = &self._seq.data[self.i]
return Token(self._seq, self.i + t.head)
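
The shape of this refactor, reduced to plain Python under assumed names: the eager field-copying in cinit_token is replaced by a view that stores only (sequence, index) and reads every attribute from the owning array on demand:

    # Sketch of the delegation pattern, not the real class.
    class TokenView:
        def __init__(self, seq, i):
            self._seq = seq   # owning sequence; holds the real token records
            self.i = i        # this view's position in the sequence
        def __len__(self):
            # every attribute read goes back to the owner's storage
            return self._seq.data[self.i]['length']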

View File

@@ -24,12 +24,13 @@ cdef struct _Cached:
cdef class Vocab:
cpdef public get_lex_props
cpdef public lexeme_props_getter
cdef Pool mem
cpdef readonly StringStore strings
cdef vector[LexemeC*] lexemes
cdef vector[const LexemeC*] lexemes
cdef const LexemeC* get(self, Pool mem, UniStr* s) except NULL
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
cdef PreshMap _map

View File

@@ -5,7 +5,7 @@ from os import path
import codecs
from .lexeme cimport EMPTY_LEXEME
from .lexeme cimport init as lexeme_init
from .lexeme cimport set_lex_struct_props
from .lexeme cimport Lexeme_cinit
from .strings cimport slice_unicode
from .strings cimport hash_string
@@ -21,24 +21,6 @@ memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
EMPTY_LEXEME.vec = EMPTY_VEC
cdef LexemeC init_lexeme(id_t i, unicode string, hash_t hashed,
StringStore string_store, dict props) except *:
cdef LexemeC lex
lex.id = i
lex.length = len(string)
lex.sic = string_store[string]
lex.cluster = props.get('cluster', 0)
lex.prob = props.get('prob', 0)
lex.prefix = string_store[string[:1]]
lex.suffix = string_store[string[-3:]]
lex.shape = string_store[word_shape(string)]
lex.flags = props.get('flags', 0)
return lex
cdef class Vocab:
'''A map container for a language's LexemeC structs.
'''
@@ -47,7 +29,7 @@ cdef class Vocab:
self._map = PreshMap(2 ** 20)
self.strings = StringStore()
self.lexemes.push_back(&EMPTY_LEXEME)
self.get_lex_props = get_lex_props
self.lexeme_props_getter = get_lex_props
if data_dir is not None:
if not path.exists(data_dir):
@@ -63,32 +45,36 @@ cdef class Vocab:
"""The current number of lexemes stored."""
return self.lexemes.size()
cdef const LexemeC* get(self, Pool mem, UniStr* string) except NULL:
cdef const LexemeC* get(self, Pool mem, UniStr* c_str) except NULL:
'''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
if necessary, using memory acquired from the given pool. If the pool
is the lexicon's own memory, the lexeme is saved in the lexicon.'''
cdef LexemeC* lex
lex = <LexemeC*>self._map.get(string.key)
lex = <LexemeC*>self._map.get(c_str.key)
if lex != NULL:
return lex
if string.n < 3:
if c_str.n < 3:
mem = self.mem
cdef unicode py_string = string.chars[:string.n]
cdef unicode py_str = c_str.chars[:c_str.n]
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
lex[0] = init_lexeme(self.lexemes.size(), py_string, string.key, self.strings,
self.get_lex_props(py_string))
props = self.lexeme_props_getter(py_str)
set_lex_struct_props(lex, props, self.strings)
if mem is self.mem:
self._map.set(string.key, lex)
lex.id = self.lexemes.size()
self._add_lex_to_vocab(c_str.key, lex)
else:
lex.id = 1
return lex
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
self._map.set(key, <void*>lex)
while self.lexemes.size() < (lex.id + 1):
self.lexemes.push_back(&EMPTY_LEXEME)
self.lexemes[lex.id] = lex
else:
lex[0].id = 1
return lex
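
The get-or-create logic above, restated as a plain-Python sketch with hypothetical names (make_lexeme stands in for the props-to-struct step):

    # Sketch: look the hashed string up; on a miss, build the lexeme from the
    # props getter and register it only if it lives in the vocab's own memory.
    def get_or_create(vocab, py_str, own_memory=True):
        key = hash(py_str)                # stands in for c_str.key
        lex = vocab.map.get(key)
        if lex is not None:
            return lex
        if len(py_str) < 3:               # short strings always use vocab memory
            own_memory = True
        lex = make_lexeme(vocab.lexeme_props_getter(py_str))  # hypothetical ctor
        if own_memory:
            lex.id = len(vocab.lexemes)   # mirrors _add_lex_to_vocab above
            vocab.map[key] = lex
            vocab.lexemes.append(lex)
        else:
            lex.id = 1                    # transient lexeme, not registered
        return lex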
def __getitem__(self, id_or_string):
'''Retrieve a lexeme, given an int ID or a unicode string. If a previously
unseen unicode string is given, a new LexemeC is created and stored.
unseen unicode string is given, a new lexeme is created and stored.
Args:
id_or_string (int or unicode): The integer ID of a word, or its unicode
@@ -100,24 +86,28 @@ cdef class Vocab:
lexeme (Lexeme): An instance of the Lexeme Python class, with data
copied on instantiation.
'''
cdef UniStr string
cdef UniStr c_str
cdef const LexemeC* lexeme
if type(id_or_string) == int:
if id_or_string >= self.lexemes.size():
raise IndexError
lexeme = self.lexemes.at(id_or_string)
else:
slice_unicode(&string, id_or_string, 0, len(id_or_string))
lexeme = self.get(self.mem, &string)
slice_unicode(&c_str, id_or_string, 0, len(id_or_string))
lexeme = self.get(self.mem, &c_str)
return Lexeme_cinit(lexeme, self.strings)
def __setitem__(self, unicode uni_string, dict props):
cdef UniStr s
slice_unicode(&s, uni_string, 0, len(uni_string))
# Cast through the const here, since we're allowed to change our own
# LexemeCs.
lex = <LexemeC*><void*>self.get(self.mem, &s)
lex[0] = lexeme_init(lex.id, s.chars[:s.n], s.key, self.strings, props)
def __setitem__(self, unicode py_str, dict props):
cdef UniStr c_str
slice_unicode(&c_str, py_str, 0, len(py_str))
cdef LexemeC* lex
lex = <LexemeC*>self._map.get(c_str.key)
if lex == NULL:
lex = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
lex.id = self.lexemes.size()
self._add_lex_to_vocab(c_str.key, lex)
set_lex_struct_props(lex, props, self.strings)
assert lex.sic < 1000000
def dump(self, loc):
if path.exists(loc):
@@ -154,6 +144,7 @@ cdef class Vocab:
if st != 1:
break
lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
lexeme.vec = EMPTY_VEC
st = fread(lexeme, sizeof(LexemeC), 1, fp)
if st != 1:
break