* Rewriting Lexeme serialization.

Matthew Honnibal 2014-10-29 23:19:38 +11:00
parent 234d49bf4d
commit 13909a2e24
13 changed files with 147 additions and 79 deletions

View File

@@ -13,16 +13,16 @@ import random
from os import path
import re
from cymem.cymem cimport Pool
from cython.operator cimport preincrement as preinc
from cython.operator cimport dereference as deref
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
from cymem.cymem cimport Pool
from murmurhash.mrmr cimport hash64
from preshed.maps cimport PreshMap
from .lexeme cimport Lexeme
from .lexeme cimport from_dict as lexeme_from_dict
from .lexeme cimport from_string as lexeme_from_string
from .lexeme cimport init as lexeme_init
from . import orth
from . import util
@@ -232,26 +232,27 @@ cdef class Lexicon:
self.mem = Pool()
self._dict = PreshMap(2 ** 20)
self.strings = StringStore()
self.size = 0
self.size = 1
cdef String string
cdef Lexeme* lexeme
#for py_string, lexeme_dict in lexemes.iteritems():
# string_from_unicode(&string, py_string)
# lexeme = <Lexeme*>self.mem.alloc(1, sizeof(Lexeme))
# lexeme_from_dict(lexeme, lexeme_dict, self.strings)
# self._dict.set(string.key, lexeme)
# self.lexemes.push_back(lexeme)
# self.size += 1
for py_string, lexeme_dict in lexemes.iteritems():
string_from_unicode(&string, py_string)
lexeme = <Lexeme*>self.mem.alloc(1, sizeof(Lexeme))
lexeme[0] = lexeme_init(string.chars[:string.n], string.key, self.size,
self.strings, lexeme_dict)
self._dict.set(lexeme.hash, lexeme)
self.lexemes.push_back(lexeme)
self.size += 1
cdef Lexeme* get(self, String* string) except NULL:
cdef Lexeme* lex
lex = <Lexeme*>self._dict.get(string.key)
if lex != NULL:
return lex
lex = <Lexeme*>self.mem.alloc(1, sizeof(Lexeme))
lexeme_from_string(lex, string.chars[:string.n], self.strings)
self._dict.set(string.key, lex)
lex = <Lexeme*>self.mem.alloc(sizeof(Lexeme), 1)
lex[0] = lexeme_init(string.chars[:string.n], string.key, self.size,
self.strings, {})
self._dict.set(lex.hash, lex)
self.lexemes.push_back(lex)
self.size += 1
return lex
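
The get() method above is a get-or-create intern: look the string up by its hash key, and on a miss allocate a fresh Lexeme, give it the next id (self.size), and register it in both the hash map and the insertion-ordered vector that dump() later walks. A rough pure-Python sketch of the same pattern follows; the class, the dict-based record, and the field subset are illustrative stand-ins, not the actual Cython structs or API.

class ToyLexicon:
    def __init__(self):
        self._by_key = {}    # hash key -> record (a PreshMap in the real code)
        self.lexemes = []    # insertion order, which a dump() would walk
        self.size = 1        # ids start at 1, mirroring self.size = 1 above

    def get(self, string):
        key = hash(string)
        rec = self._by_key.get(key)
        if rec is not None:
            return rec
        rec = {'i': self.size, 'length': len(string), 'sic': string}
        self._by_key[key] = rec
        self.lexemes.append(rec)
        self.size += 1
        return rec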
@@ -270,6 +271,34 @@ cdef class Lexicon:
cdef Lexeme* lexeme = self.get(&string)
return lexeme[0]
def dump(self, loc):
if path.exists(loc):
assert not path.isdir(loc)
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
cdef FILE* fp = fopen(<char*>bytes_loc, 'wb')
assert fp != NULL
cdef size_t st
for i in range(self.size):
st = fwrite(self.lexemes[i], sizeof(Lexeme), 1, fp)
assert st == 1
st = fclose(fp)
assert st == 0
def load(self, loc):
assert path.exists(loc)
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
cdef FILE* fp = fopen(<char*>bytes_loc, 'rb')
assert fp != NULL
cdef size_t st
cdef Lexeme* lexeme
while True:
lexeme = <Lexeme*>self.mem.alloc(sizeof(Lexeme), 1)
st = fread(lexeme, sizeof(Lexeme), 1, fp)
if st == 0:
break
self.lexemes.push_back(lexeme)
self._dict.set(lexeme.hash, lexeme)
cdef void string_from_unicode(String* s, unicode uni):
cdef Py_UNICODE* c_uni = <Py_UNICODE*>uni
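
The dump() and load() pair above treats the lexicon as a flat array of fixed-size Lexeme structs: dump() fwrite()s one struct per lexeme, and load() fread()s structs until EOF, re-registering each one by its stored hash. Because the string-valued fields (sic, norm, shape, ...) are integer ids into the StringStore, the records stay fixed-size and the variable-length strings can be saved separately (see the StringStore.dump/load hunk further down). Below is a hedged Python sketch of the same fixed-record round trip using the standard struct module; the three-field record is an illustrative stand-in for the much larger Lexeme struct.

import struct

RECORD = struct.Struct('<QQQ')   # e.g. (hash, i, length); purely illustrative

def dump_lexemes(loc, lexemes):
    # write the records back to back, exactly RECORD.size bytes apiece
    with open(loc, 'wb') as fp:
        for lex in lexemes:
            fp.write(RECORD.pack(lex['hash'], lex['i'], lex['length']))

def load_lexemes(loc):
    # read fixed-size records until EOF, mirroring the fread() loop above
    lexemes = []
    with open(loc, 'rb') as fp:
        while True:
            buf = fp.read(RECORD.size)
            if len(buf) < RECORD.size:
                break
            h, i, length = RECORD.unpack(buf)
            lexemes.append({'hash': h, 'i': i, 'length': length})
    return lexemes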

View File

@@ -23,9 +23,11 @@ cpdef enum:
cdef struct Lexeme:
atom_t id
hash_t hash
atom_t i
atom_t length
atom_t sic
atom_t norm
atom_t shape
atom_t vocab10k
@@ -44,12 +46,9 @@ cdef struct Lexeme:
cdef Lexeme EMPTY_LEXEME
cdef int from_string(Lexeme* lex, unicode string, StringStore store) except -1
cdef int from_dict(Lexeme* lex, dict props, StringStore store) except -1
cpdef Lexeme init(unicode string, hash_t hashed, atom_t i,
StringStore store, dict props) except *
cdef inline bint check_flag(Lexeme* lexeme, size_t flag_id) nogil:
return lexeme.flags & (1 << flag_id)
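
check_flag() above reads a single bit out of the packed flags word; its counterpart get_flags() in lexeme.pyx builds that word by shifting each boolean orthographic predicate into its own bit position (flags |= orth.is_alpha(string) << IS_ALPHA). A tiny Python illustration of the same bit-packing, using a made-up subset of flag ids:

IS_ALPHA, IS_ASCII, IS_DIGIT = 0, 1, 2   # illustrative bit positions

def get_flags(string):
    flags = 0
    flags |= string.isalpha() << IS_ALPHA
    flags |= all(ord(c) < 128 for c in string) << IS_ASCII
    flags |= string.isdigit() << IS_DIGIT
    return flags

def check_flag(flags, flag_id):
    # same test as the inline Cython helper: is bit flag_id set?
    return bool(flags & (1 << flag_id))

assert check_flag(get_flags('Nasa'), IS_ALPHA)
assert not check_flag(get_flags('1984'), IS_ALPHA)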

View File

@@ -1,5 +1,6 @@
from cpython.ref cimport Py_INCREF
from cymem.cymem cimport Pool
from murmurhash.mrmr cimport hash64
from libc.string cimport memset
@@ -12,7 +13,7 @@ OOV_DIST_FLAGS = 0
memset(&EMPTY_LEXEME, 0, sizeof(Lexeme))
def get_flags(unicode string):
def get_flags(unicode string, float upper_pc, float title_pc, float lower_pc):
cdef flag_t flags = 0
flags |= orth.is_alpha(string) << IS_ALPHA
flags |= orth.is_ascii(string) << IS_ASCII
@@ -25,20 +26,36 @@ def get_flags(unicode string):
return flags
cdef int from_string(Lexeme* lex, unicode string, StringStore store) except -1:
cpdef Lexeme init(unicode string, hash_t hashed, atom_t i,
StringStore store, dict props) except *:
cdef Lexeme lex
lex.hash = hashed
lex.i = i
print string, i
lex.length = len(string)
lex.sic = get_string_id(string, store)
lex.cluster = props.get('cluster', 0)
lex.pos = props.get('pos', 0)
lex.supersense = props.get('supersense', 0)
lex.prob = props.get('prob', 0)
cdef float upper_pc = props.get('upper_pc', 0.0)
cdef float lower_pc = props.get('lower_pc', 0.0)
cdef float title_pc = props.get('title_pc', 0.0)
lex.prefix = get_string_id(string[0], store)
lex.suffix = get_string_id(string[-3:], store)
canon_cased = orth.canon_case(string, upper_pc, title_pc, lower_pc)
lex.norm = get_string_id(canon_cased, store)
lex.shape = get_string_id(orth.word_shape(string), store)
lex.asciied = get_string_id(orth.asciied(string), store)
non_sparse = orth.non_sparse(string, lex.prob, lex.cluster, upper_pc, title_pc, lower_pc)
lex.vocab10k = get_string_id(non_sparse, store)
lex.flags = get_flags(string, upper_pc, title_pc, lower_pc)
return lex
cdef atom_t get_string_id(unicode string, StringStore store) except 0:
cdef bytes byte_string = string.encode('utf8')
cdef Utf8Str* orig_str = store.intern(<char*>byte_string, len(byte_string))
lex.id = orig_str.i
lex.cluster = 0
lex.length = len(string)
lex.flags = get_flags(string)
# TODO: Hook this up
#lex.norm = norm_str.i
#lex.shape = norm_str.i
#lex.asciied = asciied_str.i
#lex.prefix = prefix_str.i
#lex.suffix = suffix_str.i
cdef int from_dict(Lexeme* lex, dict props, StringStore store) except -1:
pass
return orig_str.i

View File

@@ -64,11 +64,7 @@ def can_tag(name, thresh=0.5):
# String features
def canon_case(string, prob, cluster, case_stats, tag_stats):
upper_pc = case_stats.get('upper', 0.0)
title_pc = case_stats.get('title', 0.0)
lower_pc = case_stats.get('lower', 0.0)
def canon_case(string, upper_pc=0.0, title_pc=0.0, lower_pc=0.0):
if upper_pc >= lower_pc and upper_pc >= title_pc:
return string.upper()
elif title_pc >= lower_pc:
@@ -77,7 +73,7 @@ def canon_case(string, prob, cluster, case_stats, tag_stats):
return string.lower()
def word_shape(string, *args):
def word_shape(string):
length = len(string)
shape = []
last = ""
@@ -103,15 +99,15 @@ def word_shape(string, *args):
return ''.join(shape)
def non_sparse(string, prob, cluster, case_stats, tag_stats):
def non_sparse(string, prob, cluster, upper_pc, title_pc, lower_pc):
if is_alpha(string):
return canon_case(string, prob, cluster, case_stats, tag_stats)
return canon_case(string, upper_pc, title_pc, lower_pc)
elif prob >= math.log(0.0001):
return string
else:
return word_shape(string, prob, cluster, case_stats, tag_stats)
return word_shape(string)
def asciied(string, prob=0, cluster=0, case_stats=None, tag_stats=None):
def asciied(string):
ascii_string = unidecode(string)
return ascii_string.decode('ascii')
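
The orth.py changes above replace the case_stats/tag_stats dicts with three explicit frequencies: canon_case() now returns whichever casing (upper, title, lower) was seen most often, and non_sparse() backs off from that canonical form to the raw string for frequent tokens and to the word shape for rare ones. A self-contained sketch of that decision logic follows, matching what the tests further down expect ('Nasa' with upper_pc=0.6 gives 'NASA', rare '1999' gives 'dddd'); the toy_shape() helper is a simplified stand-in for the real word_shape().

import math

def canon_case(string, upper_pc=0.0, title_pc=0.0, lower_pc=0.0):
    # pick the casing with the highest corpus frequency, ties going to upper
    if upper_pc >= lower_pc and upper_pc >= title_pc:
        return string.upper()
    elif title_pc >= lower_pc:
        return string.title()
    else:
        return string.lower()

def toy_shape(string):
    # simplified word shape: digits -> 'd', upper -> 'X', everything else -> 'x'
    return ''.join('d' if c.isdigit() else 'X' if c.isupper() else 'x'
                   for c in string)

def non_sparse(string, prob, cluster, upper_pc, title_pc, lower_pc):
    if string.isalpha():
        return canon_case(string, upper_pc, title_pc, lower_pc)
    elif prob >= math.log(0.0001):
        return string              # frequent enough to keep verbatim
    else:
        return toy_shape(string)   # rare token: back off to its shape

assert canon_case('Nasa', 0.6, 0.3, 0.1) == 'NASA'
assert non_sparse('1999', math.log(0.00001), 0, 0.0, 0.0, 0.0) == 'dddd'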

View File

@@ -31,10 +31,15 @@ cdef class Tagger:
self._values = <weight_t*>self.mem.alloc(self.extractor.n+1, sizeof(weight_t))
self._scores = <weight_t*>self.mem.alloc(len(self.tags), sizeof(weight_t))
self._guess = NULL_TAG
if path.exists(path.join(model_dir, 'model.gz')):
with gzip.open(path.join(model_dir, 'model.gz'), 'r') as file_:
self.model.load(file_)
if path.exists(path.join(model_dir, 'model')):
self.model.load(path.join(model_dir, 'model'))
tags_loc = path.join(model_dir, 'postags.json')
if path.exists(tags_loc):
with open(tags_loc) as file_:
Tagger.tags.update(ujson.load(file_))
if path.exists(path.join(model_dir, 'strings')):
EN.lexicon.strings.load(path.join(model_dir, 'strings'))
cpdef class_t predict(self, int i, Tokens tokens, class_t prev, class_t prev_prev) except 0:
assert i >= 0
get_atoms(self._atoms, tokens.lex[i-2], tokens.lex[i-1], tokens.lex[i],
@@ -125,7 +130,7 @@ cdef int get_atoms(atom_t* atoms, Lexeme* p2, Lexeme* p1, Lexeme* n0, Lexeme* n1
cdef inline void _fill_token(atom_t* atoms, Lexeme* lex) nogil:
atoms[0] = lex.id
atoms[0] = lex.i
atoms[1] = lex.cluster
atoms[2] = lex.norm
atoms[3] = lex.shape

View File

@@ -37,6 +37,7 @@ cdef class Token:
cdef public atom_t lex_pos
cdef public atom_t lex_supersense
cdef public atom_t sic
cdef public atom_t norm
cdef public atom_t shape
cdef public atom_t vocab10k

View File

@@ -101,16 +101,18 @@ cdef class Tokens:
@cython.freelist(64)
cdef class Token:
def __init__(self, StringStore string_store, int i, int idx, int pos, dict lex):
assert i < 1000000
self._string_store = string_store
self.i = i
self.id = i
self.idx = idx
self.pos = pos
self.id = lex['id']
self.id = lex['i']
self.cluster = lex['cluster']
self.length = lex['length']
self.lex_pos = lex['pos']
self.lex_supersense = lex['supersense']
self.sic = lex['sic']
self.norm = lex['norm']
self.shape = lex['shape']
self.vocab10k = lex['vocab10k']
@@ -122,6 +124,6 @@ cdef class Token:
property string:
def __get__(self):
cdef bytes utf8string = self._string_store[self.id]
cdef bytes utf8string = self._string_store[self.sic]
return utf8string.decode('utf8')

View File

@@ -2,6 +2,8 @@ from libc.string cimport memcpy
from murmurhash.mrmr cimport hash64
import ujson
cdef class StringStore:
def __init__(self):
@@ -51,3 +53,20 @@ cdef class StringStore:
else:
i = <size_t>value
return &self.strings[i]
def dump(self, loc):
strings = []
cdef Utf8Str* string
cdef bytes py_string
for i in range(self.size):
string = &self.strings[i]
py_string = string.chars[:string.length]
strings.append(py_string)
with open(loc, 'w') as file_:
ujson.dump(strings, file_, ensure_ascii=False)
def load(self, loc):
with open(loc) as file_:
strings = ujson.load(file_)
for string in strings[1:]:
self.intern(string, len(string))
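
StringStore.dump() above writes every interned string out as one JSON list, and load() re-interns everything except element 0, the store's reserved first slot (the Lexicon's size likewise starts at 1). A hedged sketch of that round trip with a toy intern table; the class name is made up, and the stdlib json module stands in for ujson.

import json

class ToyStringStore:
    def __init__(self):
        self._strings = ['']     # slot 0 reserved, so real ids start at 1
        self._index = {}

    def intern(self, s):
        if s not in self._index:
            self._index[s] = len(self._strings)
            self._strings.append(s)
        return self._index[s]

    def dump(self, loc):
        with open(loc, 'w') as file_:
            json.dump(self._strings, file_, ensure_ascii=False)

    def load(self, loc):
        with open(loc) as file_:
            for s in json.load(file_)[1:]:   # skip the reserved slot, as above
                self.intern(s)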

View File

@@ -5,16 +5,16 @@ import py.test
from spacy.orth import canon_case as cc
def test_nasa():
assert cc('Nasa', 0.0, 0, {'upper': 0.6, 'title': 0.3, 'lower': 0.1}, {}) == 'NASA'
assert cc('Nasa', 0.6, 0.3, 0.1) == 'NASA'
def test_john():
assert cc('john', 0.0, 0, {'title': 0.6, 'upper': 0.3, 'lower': 0.1}, {}) == 'John'
assert cc('john', 0.3, 0.6, 0.1) == 'John'
def test_apple():
assert cc('apple', 0.0, 0, {'lower': 0.6, 'title': 0.3, 'upper': 0.1}, {}) == 'apple'
assert cc('apple', 0.1, 0.3, 0.6) == 'apple'
def test_tie():
assert cc('I', 0.0, 0, {'lower': 0.0, 'title': 0.0, 'upper': 0.0}, {}) == 'I'
assert cc('I', 0.0, 0.0, 0.0) == 'I'

View File

@@ -5,8 +5,8 @@ from spacy.en import EN
def test_possess():
tokens = EN.tokenize("Mike's")
assert EN.lexicon.strings[tokens[0].id] == "Mike"
assert EN.lexicon.strings[tokens[1].id] == "'s"
assert EN.lexicon.strings[tokens[0].sic] == "Mike"
assert EN.lexicon.strings[tokens[1].sic] == "'s"
assert len(tokens) == 2

View File

@@ -5,21 +5,21 @@ import math
def test_common_case_upper():
cases = {'upper': 0.7, 'lower': 0.2, 'title': 0.1}
cases = {'u': 0.7, 'l': 0.2, 't': 0.1}
prob = math.log(0.1)
assert non_sparse('usa', prob, 0, cases, {}) == 'USA'
assert non_sparse('usa', prob, 0, cases['u'], cases['t'], cases['l']) == 'USA'
def test_same():
cases = {'upper': 0.01, 'title': 0.09, 'lower': 0.9}
cases = {'u': 0.01, 't': 0.09, 'l': 0.9}
prob = math.log(0.5)
assert non_sparse('the', prob, 0, cases, {}) == 'the'
assert non_sparse('the', prob, 0, cases['u'], cases['t'], cases['l']) == 'the'
def test_common_case_lower():
prob = math.log(0.5)
cases = {'upper': 0.01, 'title': 0.09, 'lower': 0.9}
assert non_sparse('The', prob, 0, cases, {}) == 'the'
cases = {'u': 0.01, 't': 0.09, 'l': 0.9}
assert non_sparse('The', prob, 0, cases['u'], cases['t'], cases['l']) == 'the'
def test_shape():
prob = math.log(0.00001)
cases = {'upper': 0.0, 'title': 0.0, 'lower': 0.0}
assert non_sparse('1999', prob, 0, cases, {}) == 'dddd'
cases = {'u': 0.0, 't': 0.0, 'l': 0.0}
assert non_sparse('1999', prob, 0, cases['u'], cases['t'], cases['l']) == 'dddd'

View File

@@ -27,17 +27,17 @@ def test_punct():
def test_digits():
tokens = EN.tokenize('The year: 1984.')
assert len(tokens) == 5
assert tokens[0].id == EN.lexicon.lookup('The')['id']
assert tokens[3].id == EN.lexicon.lookup('1984')['id']
assert tokens[0].sic == EN.lexicon.lookup('The')['sic']
assert tokens[3].sic == EN.lexicon.lookup('1984')['sic']
def test_contraction():
tokens = EN.tokenize("don't giggle")
assert len(tokens) == 3
assert tokens[1].id == EN.lexicon.lookup("not")['id']
assert tokens[1].sic == EN.lexicon.lookup("not")['sic']
tokens = EN.tokenize("i said don't!")
assert len(tokens) == 5
assert tokens[4].id == EN.lexicon.lookup('!')['id']
assert tokens[4].sic == EN.lexicon.lookup('!')['sic']
def test_contraction_punct():

View File

@@ -5,19 +5,19 @@ from spacy.en import EN
def test_neq():
addr = EN.lexicon.lookup('Hello')
assert EN.lexicon.lookup('bye')['id'] != addr['id']
assert EN.lexicon.lookup('bye')['sic'] != addr['sic']
def test_eq():
addr = EN.lexicon.lookup('Hello')
assert EN.lexicon.lookup('Hello')['id'] == addr['id']
assert EN.lexicon.lookup('Hello')['sic'] == addr['sic']
def test_case_neq():
addr = EN.lexicon.lookup('Hello')
assert EN.lexicon.lookup('hello')['id'] != addr['id']
assert EN.lexicon.lookup('hello')['sic'] != addr['sic']
def test_punct_neq():
addr = EN.lexicon.lookup('Hello')
assert EN.lexicon.lookup('Hello,')['id'] != addr['id']
assert EN.lexicon.lookup('Hello,')['sic'] != addr['sic']