diff --git a/spacy/lang.pyx b/spacy/lang.pyx index d106f172a..5b5892fdc 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -13,16 +13,16 @@ import random from os import path import re -from cymem.cymem cimport Pool from cython.operator cimport preincrement as preinc from cython.operator cimport dereference as deref +from libc.stdio cimport fopen, fclose, fread, fwrite, FILE +from cymem.cymem cimport Pool from murmurhash.mrmr cimport hash64 from preshed.maps cimport PreshMap from .lexeme cimport Lexeme -from .lexeme cimport from_dict as lexeme_from_dict -from .lexeme cimport from_string as lexeme_from_string +from .lexeme cimport init as lexeme_init from . import orth from . import util @@ -232,26 +232,27 @@ cdef class Lexicon: self.mem = Pool() self._dict = PreshMap(2 ** 20) self.strings = StringStore() - self.size = 0 + self.size = 1 cdef String string cdef Lexeme* lexeme - #for py_string, lexeme_dict in lexemes.iteritems(): - # string_from_unicode(&string, py_string) - # lexeme = self.mem.alloc(1, sizeof(Lexeme)) - # lexeme_from_dict(lexeme, lexeme_dict, self.strings) - # self._dict.set(string.key, lexeme) - # self.lexemes.push_back(lexeme) - # self.size += 1 + for py_string, lexeme_dict in lexemes.iteritems(): + string_from_unicode(&string, py_string) + lexeme = self.mem.alloc(1, sizeof(Lexeme)) + lexeme[0] = lexeme_init(string.chars[:string.n], string.key, self.size, + self.strings, lexeme_dict) + self._dict.set(lexeme.hash, lexeme) + self.lexemes.push_back(lexeme) + self.size += 1 cdef Lexeme* get(self, String* string) except NULL: cdef Lexeme* lex lex = self._dict.get(string.key) if lex != NULL: return lex - - lex = self.mem.alloc(1, sizeof(Lexeme)) - lexeme_from_string(lex, string.chars[:string.n], self.strings) - self._dict.set(string.key, lex) + lex = self.mem.alloc(sizeof(Lexeme), 1) + lex[0] = lexeme_init(string.chars[:string.n], string.key, self.size, + self.strings, {}) + self._dict.set(lex.hash, lex) self.lexemes.push_back(lex) self.size += 1 return lex @@ -270,6 +271,34 @@ cdef class Lexicon: cdef Lexeme* lexeme = self.get(&string) return lexeme[0] + def dump(self, loc): + if path.exists(loc): + assert not path.isdir(loc) + cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc + cdef FILE* fp = fopen(bytes_loc, 'wb') + assert fp != NULL + cdef size_t st + for i in range(self.size): + st = fwrite(self.lexemes[i], sizeof(Lexeme), 1, fp) + assert st == 1 + st = fclose(fp) + assert st == 0 + + def load(self, loc): + assert path.exists(loc) + cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc + cdef FILE* fp = fopen(bytes_loc, 'rb') + assert fp != NULL + cdef size_t st + cdef Lexeme* lexeme + while True: + lexeme = self.mem.alloc(sizeof(Lexeme), 1) + st = fread(lexeme, sizeof(lexeme), 1, fp) + if st == 0: + break + self.lexemes.push_back(lexeme) + self._dict.set(lexeme.hash, lexeme) + cdef void string_from_unicode(String* s, unicode uni): cdef Py_UNICODE* c_uni = uni diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index 3cd65c995..235883e2a 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -23,9 +23,11 @@ cpdef enum: cdef struct Lexeme: - atom_t id + hash_t hash + atom_t i atom_t length - + + atom_t sic atom_t norm atom_t shape atom_t vocab10k @@ -44,12 +46,9 @@ cdef struct Lexeme: cdef Lexeme EMPTY_LEXEME - -cdef int from_string(Lexeme* lex, unicode string, StringStore store) except -1 - - -cdef int from_dict(Lexeme* lex, dict props, StringStore store) except -1 - +cpdef Lexeme init(unicode string, hash_t hashed, atom_t i, + 
StringStore store, dict props) except * + cdef inline bint check_flag(Lexeme* lexeme, size_t flag_id) nogil: return lexeme.flags & (1 << flag_id) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index d442a262e..03c6e2270 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -1,5 +1,6 @@ from cpython.ref cimport Py_INCREF from cymem.cymem cimport Pool +from murmurhash.mrmr cimport hash64 from libc.string cimport memset @@ -12,7 +13,7 @@ OOV_DIST_FLAGS = 0 memset(&EMPTY_LEXEME, 0, sizeof(Lexeme)) -def get_flags(unicode string): +def get_flags(unicode string, float upper_pc, float title_pc, float lower_pc): cdef flag_t flags = 0 flags |= orth.is_alpha(string) << IS_ALPHA flags |= orth.is_ascii(string) << IS_ASCII @@ -25,20 +26,36 @@ def get_flags(unicode string): return flags -cdef int from_string(Lexeme* lex, unicode string, StringStore store) except -1: +cpdef Lexeme init(unicode string, hash_t hashed, atom_t i, + StringStore store, dict props) except *: + cdef Lexeme lex + lex.hash = hashed + lex.i = i + print string, i + lex.length = len(string) + lex.sic = get_string_id(string, store) + + lex.cluster = props.get('cluster', 0) + lex.pos = props.get('pos', 0) + lex.supersense = props.get('supersense', 0) + lex.prob = props.get('prob', 0) + + cdef float upper_pc = props.get('upper_pc', 0.0) + cdef float lower_pc = props.get('lower_pc', 0.0) + cdef float title_pc = props.get('title_pc', 0.0) + + lex.prefix = get_string_id(string[0], store) + lex.suffix = get_string_id(string[-3:], store) + canon_cased = orth.canon_case(string, upper_pc, title_pc, lower_pc) + lex.norm = get_string_id(canon_cased, store) + lex.shape = get_string_id(orth.word_shape(string), store) + lex.asciied = get_string_id(orth.asciied(string), store) + non_sparse = orth.non_sparse(string, lex.prob, lex.cluster, upper_pc, title_pc, lower_pc) + lex.vocab10k = get_string_id(non_sparse, store) + lex.flags = get_flags(string, upper_pc, title_pc, lower_pc) + return lex + +cdef atom_t get_string_id(unicode string, StringStore store) except 0: cdef bytes byte_string = string.encode('utf8') cdef Utf8Str* orig_str = store.intern(byte_string, len(byte_string)) - lex.id = orig_str.i - lex.cluster = 0 - lex.length = len(string) - lex.flags = get_flags(string) - # TODO: Hook this up - #lex.norm = norm_str.i - #lex.shape = norm_str.i - #lex.asciied = asciied_str.i - #lex.prefix = prefix_str.i - #lex.suffix = suffix_str.i - - -cdef int from_dict(Lexeme* lex, dict props, StringStore stroe) except -1: - pass + return orig_str.i diff --git a/spacy/orth.py b/spacy/orth.py index 4bec8d665..13ae2c0c4 100644 --- a/spacy/orth.py +++ b/spacy/orth.py @@ -64,11 +64,7 @@ def can_tag(name, thresh=0.5): # String features -def canon_case(string, prob, cluster, case_stats, tag_stats): - upper_pc = case_stats.get('upper', 0.0) - title_pc = case_stats.get('title', 0.0) - lower_pc = case_stats.get('lower', 0.0) - +def canon_case(string, upper_pc=0.0, title_pc=0.0, lower_pc=0.0): if upper_pc >= lower_pc and upper_pc >= title_pc: return string.upper() elif title_pc >= lower_pc: @@ -77,7 +73,7 @@ def canon_case(string, prob, cluster, case_stats, tag_stats): return string.lower() -def word_shape(string, *args): +def word_shape(string): length = len(string) shape = [] last = "" @@ -103,15 +99,15 @@ def word_shape(string, *args): return ''.join(shape) -def non_sparse(string, prob, cluster, case_stats, tag_stats): +def non_sparse(string, prob, cluster, upper_pc, title_pc, lower_pc): if is_alpha(string): - return canon_case(string, prob, cluster, case_stats, 
tag_stats) + return canon_case(string, upper_pc, title_pc, lower_pc) elif prob >= math.log(0.0001): return string else: - return word_shape(string, prob, cluster, case_stats, tag_stats) + return word_shape(string) -def asciied(string, prob=0, cluster=0, case_stats=None, tag_stats=None): +def asciied(string): ascii_string = unidecode(string) return ascii_string.decode('ascii') diff --git a/spacy/pos.pyx b/spacy/pos.pyx index 0e79cddd7..263f88edb 100644 --- a/spacy/pos.pyx +++ b/spacy/pos.pyx @@ -31,10 +31,15 @@ cdef class Tagger: self._values = self.mem.alloc(self.extractor.n+1, sizeof(weight_t)) self._scores = self.mem.alloc(len(self.tags), sizeof(weight_t)) self._guess = NULL_TAG - if path.exists(path.join(model_dir, 'model.gz')): - with gzip.open(path.join(model_dir, 'model.gz'), 'r') as file_: - self.model.load(file_) - + if path.exists(path.join(model_dir, 'model')): + self.model.load(path.join(model_dir, 'model')) + tags_loc = path.join(model_dir, 'postags.json') + if path.exists(tags_loc): + with open(tags_loc) as file_: + Tagger.tags.update(ujson.load(file_)) + if path.exists(path.join(model_dir, 'strings')): + EN.lexicon.strings.load(path.join(model_dir, 'strings')) + cpdef class_t predict(self, int i, Tokens tokens, class_t prev, class_t prev_prev) except 0: assert i >= 0 get_atoms(self._atoms, tokens.lex[i-2], tokens.lex[i-1], tokens.lex[i], @@ -125,7 +130,7 @@ cdef int get_atoms(atom_t* atoms, Lexeme* p2, Lexeme* p1, Lexeme* n0, Lexeme* n1 cdef inline void _fill_token(atom_t* atoms, Lexeme* lex) nogil: - atoms[0] = lex.id + atoms[0] = lex.i atoms[1] = lex.cluster atoms[2] = lex.norm atoms[3] = lex.shape diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index 9847cdc3c..d6b655074 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -37,6 +37,7 @@ cdef class Token: cdef public atom_t lex_pos cdef public atom_t lex_supersense + cdef public atom_t sic cdef public atom_t norm cdef public atom_t shape cdef public atom_t vocab10k diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index 56ffc343f..6abfd5b6a 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -101,16 +101,18 @@ cdef class Tokens: @cython.freelist(64) cdef class Token: def __init__(self, StringStore string_store, int i, int idx, int pos, dict lex): + assert i < 1000000 self._string_store = string_store - self.i = i + self.id = i self.idx = idx self.pos = pos - self.id = lex['id'] + self.id = lex['i'] self.cluster = lex['cluster'] self.length = lex['length'] self.lex_pos = lex['pos'] self.lex_supersense = lex['supersense'] + self.sic = lex['sic'] self.norm = lex['norm'] self.shape = lex['shape'] self.vocab10k = lex['vocab10k'] @@ -122,6 +124,6 @@ cdef class Token: property string: def __get__(self): - cdef bytes utf8string = self._string_store[self.id] + cdef bytes utf8string = self._string_store[self.sic] return utf8string.decode('utf8') diff --git a/spacy/utf8string.pyx b/spacy/utf8string.pyx index d3bc3a4fe..8cb2bebd2 100644 --- a/spacy/utf8string.pyx +++ b/spacy/utf8string.pyx @@ -2,6 +2,8 @@ from libc.string cimport memcpy from murmurhash.mrmr cimport hash64 +import ujson + cdef class StringStore: def __init__(self): @@ -51,3 +53,20 @@ cdef class StringStore: else: i = value return &self.strings[i] + + def dump(self, loc): + strings = [] + cdef Utf8Str* string + cdef bytes py_string + for i in range(self.size): + string = &self.strings[i] + py_string = string.chars[:string.length] + strings.append(py_string) + with open(loc, 'w') as file_: + ujson.dump(strings, file_, ensure_ascii=False) + + def load(self, loc): + 
with open(loc) as file_: + strings = ujson.load(file_) + for string in strings[1:]: + self.intern(string, len(string)) diff --git a/tests/test_canon_case.py b/tests/test_canon_case.py index 4b0fd21b3..2c8dd255b 100644 --- a/tests/test_canon_case.py +++ b/tests/test_canon_case.py @@ -5,16 +5,16 @@ import py.test from spacy.orth import canon_case as cc def test_nasa(): - assert cc('Nasa', 0.0, 0, {'upper': 0.6, 'title': 0.3, 'lower': 0.1}, {}) == 'NASA' + assert cc('Nasa', 0.6, 0.3, 0.1) == 'NASA' def test_john(): - assert cc('john', 0.0, 0, {'title': 0.6, 'upper': 0.3, 'lower': 0.1}, {}) == 'John' + assert cc('john', 0.3, 0.6, 0.1) == 'John' def test_apple(): - assert cc('apple', 0.0, 0, {'lower': 0.6, 'title': 0.3, 'upper': 0.1}, {}) == 'apple' + assert cc('apple', 0.1, 0.3, 0.6) == 'apple' def test_tie(): - assert cc('I', 0.0, 0, {'lower': 0.0, 'title': 0.0, 'upper': 0.0}, {}) == 'I' + assert cc('I', 0.0, 0.0, 0.0) == 'I' diff --git a/tests/test_contractions.py b/tests/test_contractions.py index 5a2eaf3a9..b7347a617 100644 --- a/tests/test_contractions.py +++ b/tests/test_contractions.py @@ -5,8 +5,8 @@ from spacy.en import EN def test_possess(): tokens = EN.tokenize("Mike's") - assert EN.lexicon.strings[tokens[0].id] == "Mike" - assert EN.lexicon.strings[tokens[1].id] == "'s" + assert EN.lexicon.strings[tokens[0].sic] == "Mike" + assert EN.lexicon.strings[tokens[1].sic] == "'s" assert len(tokens) == 2 diff --git a/tests/test_non_sparse.py b/tests/test_non_sparse.py index a7a05adf1..c7b75de85 100644 --- a/tests/test_non_sparse.py +++ b/tests/test_non_sparse.py @@ -5,21 +5,21 @@ import math def test_common_case_upper(): - cases = {'upper': 0.7, 'lower': 0.2, 'title': 0.1} + cases = {'u': 0.7, 'l': 0.2, 't': 0.1} prob = math.log(0.1) - assert non_sparse('usa', prob, 0, cases, {}) == 'USA' + assert non_sparse('usa', prob, 0, cases['u'], cases['t'], cases['l']) == 'USA' def test_same(): - cases = {'upper': 0.01, 'title': 0.09, 'lower': 0.9} + cases = {'u': 0.01, 't': 0.09, 'l': 0.9} prob = math.log(0.5) - assert non_sparse('the', prob, 0, cases, {}) == 'the' + assert non_sparse('the', prob, 0, cases['u'], cases['t'], cases['l']) == 'the' def test_common_case_lower(): prob = math.log(0.5) - cases = {'upper': 0.01, 'title': 0.09, 'lower': 0.9} - assert non_sparse('The', prob, 0, cases, {}) == 'the' + cases = {'u': 0.01, 't': 0.09, 'l': 0.9} + assert non_sparse('The', prob, 0, cases['u'], cases['t'], cases['l']) == 'the' def test_shape(): prob = math.log(0.00001) - cases = {'upper': 0.0, 'title': 0.0, 'lower': 0.0} - assert non_sparse('1999', prob, 0, cases, {}) == 'dddd' + cases = {'u': 0.0, 't': 0.0, 'l': 0.0} + assert non_sparse('1999', prob, 0, cases['u'], cases['t'], cases['l']) == 'dddd' diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index 73ac91261..4624e2828 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -27,17 +27,17 @@ def test_punct(): def test_digits(): tokens = EN.tokenize('The year: 1984.') assert len(tokens) == 5 - assert tokens[0].id == EN.lexicon.lookup('The')['id'] - assert tokens[3].id == EN.lexicon.lookup('1984')['id'] + assert tokens[0].sic == EN.lexicon.lookup('The')['sic'] + assert tokens[3].sic == EN.lexicon.lookup('1984')['sic'] def test_contraction(): tokens = EN.tokenize("don't giggle") assert len(tokens) == 3 - assert tokens[1].id == EN.lexicon.lookup("not")['id'] + assert tokens[1].sic == EN.lexicon.lookup("not")['sic'] tokens = EN.tokenize("i said don't!") assert len(tokens) == 5 - assert tokens[4].id == 
EN.lexicon.lookup('!')['id'] + assert tokens[4].sic == EN.lexicon.lookup('!')['sic'] def test_contraction_punct(): diff --git a/tests/test_vocab.py b/tests/test_vocab.py index 640fa5041..036e5981c 100644 --- a/tests/test_vocab.py +++ b/tests/test_vocab.py @@ -5,19 +5,19 @@ from spacy.en import EN def test_neq(): addr = EN.lexicon.lookup('Hello') - assert EN.lexicon.lookup('bye')['id'] != addr['id'] + assert EN.lexicon.lookup('bye')['sic'] != addr['sic'] def test_eq(): addr = EN.lexicon.lookup('Hello') - assert EN.lexicon.lookup('Hello')['id'] == addr['id'] + assert EN.lexicon.lookup('Hello')['sic'] == addr['sic'] def test_case_neq(): addr = EN.lexicon.lookup('Hello') - assert EN.lexicon.lookup('hello')['id'] != addr['id'] + assert EN.lexicon.lookup('hello')['sic'] != addr['sic'] def test_punct_neq(): addr = EN.lexicon.lookup('Hello') - assert EN.lexicon.lookup('Hello,')['id'] != addr['id'] + assert EN.lexicon.lookup('Hello,')['sic'] != addr['sic']
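
Note on the new Lexicon.dump()/load() pair above: both treat the lexicon as a flat array of fixed-size Lexeme structs, fwrite'ing each struct verbatim and fread'ing records back until a short read signals end of file. A minimal pure-Python analogue of that fixed-size-record pattern is sketched below; the three-field record layout and the dump_records/load_records names are illustrative only and are not part of the patch.

    import struct

    RECORD = struct.Struct('<QII')  # e.g. hash (uint64), i, length -- a toy layout

    def dump_records(loc, records):
        # Mirror of Lexicon.dump(): write each fixed-size record back to back.
        with open(loc, 'wb') as fp:
            for rec in records:
                fp.write(RECORD.pack(*rec))

    def load_records(loc):
        # Mirror of Lexicon.load(): keep reading whole records until EOF.
        records = []
        with open(loc, 'rb') as fp:
            while True:
                buf = fp.read(RECORD.size)
                if len(buf) < RECORD.size:  # short read == end of file
                    break
                records.append(RECORD.unpack(buf))
        return records

    recs = [(0xdeadbeef, 1, 5), (0xfeedface, 2, 3)]
    dump_records('lexemes.bin', recs)
    assert load_records('lexemes.bin') == recs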
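
The reworked orth helpers now take the three case frequencies as separate floats instead of the old case_stats/tag_stats dicts. A small usage sketch of the new signatures, using only values that already appear in the updated tests:

    import math
    from spacy.orth import canon_case, non_sparse

    # Tokens that are mostly upper-cased in the corpus canonicalise to upper case.
    assert canon_case('Nasa', upper_pc=0.6, title_pc=0.3, lower_pc=0.1) == 'NASA'

    # Frequent alphabetic tokens keep their canonical casing...
    assert non_sparse('The', math.log(0.5), 0, 0.01, 0.09, 0.9) == 'the'
    # ...while rare tokens back off to the word shape.
    assert non_sparse('1999', math.log(0.00001), 0, 0.0, 0.0, 0.0) == 'dddd'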
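
Taken together, the new persistence hooks are presumably meant to be used as a pair: StringStore.dump()/load() serialise the interned strings as a JSON list (the loader skips entry 0, which the store reserves), while Lexicon.dump()/load() write and re-read the raw Lexeme structs. A rough round-trip sketch, with illustrative file names and assuming a built EN lexicon:

    from spacy.en import EN

    EN.lexicon.lookup('Hello')                 # ensure at least one entry exists
    EN.lexicon.strings.dump('strings.json')    # JSON list of interned strings
    EN.lexicon.dump('lexemes.bin')             # raw Lexeme structs, in insertion order

    # In a fresh process (cf. Tagger.__init__ above, which loads 'strings' from the
    # model directory), the strings would be re-interned first, so that the string
    # ids stored inside each Lexeme resolve to the same entries, before the lexemes
    # themselves are re-read.
    EN.lexicon.strings.load('strings.json')
    EN.lexicon.load('lexemes.bin')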