Mirror of https://github.com/explosion/spaCy.git, synced 2025-01-26 17:24:41 +03:00
* Rewriting Lexeme serialization.
This commit is contained in:
parent 234d49bf4d
commit 13909a2e24
@@ -13,16 +13,16 @@ import random
from os import path
import re

from cymem.cymem cimport Pool
from cython.operator cimport preincrement as preinc
from cython.operator cimport dereference as deref
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE

from cymem.cymem cimport Pool
from murmurhash.mrmr cimport hash64
from preshed.maps cimport PreshMap

from .lexeme cimport Lexeme
from .lexeme cimport from_dict as lexeme_from_dict
from .lexeme cimport from_string as lexeme_from_string
from .lexeme cimport init as lexeme_init

from . import orth
from . import util
@@ -232,26 +232,27 @@ cdef class Lexicon:
        self.mem = Pool()
        self._dict = PreshMap(2 ** 20)
        self.strings = StringStore()
        self.size = 0
        self.size = 1
        cdef String string
        cdef Lexeme* lexeme
        #for py_string, lexeme_dict in lexemes.iteritems():
        #    string_from_unicode(&string, py_string)
        #    lexeme = <Lexeme*>self.mem.alloc(1, sizeof(Lexeme))
        #    lexeme_from_dict(lexeme, lexeme_dict, self.strings)
        #    self._dict.set(string.key, lexeme)
        #    self.lexemes.push_back(lexeme)
        #    self.size += 1
        for py_string, lexeme_dict in lexemes.iteritems():
            string_from_unicode(&string, py_string)
            lexeme = <Lexeme*>self.mem.alloc(1, sizeof(Lexeme))
            lexeme[0] = lexeme_init(string.chars[:string.n], string.key, self.size,
                                    self.strings, lexeme_dict)
            self._dict.set(lexeme.hash, lexeme)
            self.lexemes.push_back(lexeme)
            self.size += 1

    cdef Lexeme* get(self, String* string) except NULL:
        cdef Lexeme* lex
        lex = <Lexeme*>self._dict.get(string.key)
        if lex != NULL:
            return lex

        lex = <Lexeme*>self.mem.alloc(1, sizeof(Lexeme))
        lexeme_from_string(lex, string.chars[:string.n], self.strings)
        self._dict.set(string.key, lex)
        lex = <Lexeme*>self.mem.alloc(sizeof(Lexeme), 1)
        lex[0] = lexeme_init(string.chars[:string.n], string.key, self.size,
                             self.strings, {})
        self._dict.set(lex.hash, lex)
        self.lexemes.push_back(lex)
        self.size += 1
        return lex
@@ -270,6 +271,34 @@ cdef class Lexicon:
        cdef Lexeme* lexeme = self.get(&string)
        return lexeme[0]

    def dump(self, loc):
        if path.exists(loc):
            assert not path.isdir(loc)
        cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
        cdef FILE* fp = fopen(<char*>bytes_loc, 'wb')
        assert fp != NULL
        cdef size_t st
        for i in range(self.size):
            st = fwrite(self.lexemes[i], sizeof(Lexeme), 1, fp)
            assert st == 1
        st = fclose(fp)
        assert st == 0

    def load(self, loc):
        assert path.exists(loc)
        cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
        cdef FILE* fp = fopen(<char*>bytes_loc, 'rb')
        assert fp != NULL
        cdef size_t st
        cdef Lexeme* lexeme
        while True:
            lexeme = <Lexeme*>self.mem.alloc(sizeof(Lexeme), 1)
            st = fread(lexeme, sizeof(lexeme), 1, fp)
            if st == 0:
                break
            self.lexemes.push_back(lexeme)
            self._dict.set(lexeme.hash, lexeme)


cdef void string_from_unicode(String* s, unicode uni):
    cdef Py_UNICODE* c_uni = <Py_UNICODE*>uni
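The new Lexicon.dump/load above serialise the lexicon by streaming raw Lexeme structs to disk as fixed-width binary records and reading them back until fread comes up short. A minimal pure-Python sketch of the same round-trip idea, using a hypothetical two-field record rather than the real Lexeme layout:

import struct

RECORD = struct.Struct('<Qq')  # hypothetical layout: 64-bit hash, 64-bit index

def dump_records(loc, records):
    # write each (hash, index) pair as one fixed-width binary record
    with open(loc, 'wb') as f:
        for hashed, i in records:
            f.write(RECORD.pack(hashed, i))

def load_records(loc):
    # keep reading whole records until the file runs out, like the fread loop above
    out = []
    with open(loc, 'rb') as f:
        while True:
            buf = f.read(RECORD.size)
            if len(buf) < RECORD.size:
                break
            out.append(RECORD.unpack(buf))
    return out

dump_records('lexemes.bin', [(12345, 1), (67890, 2)])
assert load_records('lexemes.bin') == [(12345, 1), (67890, 2)]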
@@ -23,9 +23,11 @@ cpdef enum:


cdef struct Lexeme:
    atom_t id
    hash_t hash
    atom_t i
    atom_t length


    atom_t sic
    atom_t norm
    atom_t shape
    atom_t vocab10k
@@ -44,12 +46,9 @@ cdef struct Lexeme:

cdef Lexeme EMPTY_LEXEME


cdef int from_string(Lexeme* lex, unicode string, StringStore store) except -1


cdef int from_dict(Lexeme* lex, dict props, StringStore store) except -1

cpdef Lexeme init(unicode string, hash_t hashed, atom_t i,
                  StringStore store, dict props) except *


cdef inline bint check_flag(Lexeme* lexeme, size_t flag_id) nogil:
    return lexeme.flags & (1 << flag_id)
@@ -1,5 +1,6 @@
from cpython.ref cimport Py_INCREF
from cymem.cymem cimport Pool
from murmurhash.mrmr cimport hash64

from libc.string cimport memset
@@ -12,7 +13,7 @@ OOV_DIST_FLAGS = 0
memset(&EMPTY_LEXEME, 0, sizeof(Lexeme))


def get_flags(unicode string):
def get_flags(unicode string, float upper_pc, float title_pc, float lower_pc):
    cdef flag_t flags = 0
    flags |= orth.is_alpha(string) << IS_ALPHA
    flags |= orth.is_ascii(string) << IS_ASCII
@@ -25,20 +26,36 @@ def get_flags(unicode string):
    return flags


cdef int from_string(Lexeme* lex, unicode string, StringStore store) except -1:
cpdef Lexeme init(unicode string, hash_t hashed, atom_t i,
                  StringStore store, dict props) except *:
    cdef Lexeme lex
    lex.hash = hashed
    lex.i = i
    print string, i
    lex.length = len(string)
    lex.sic = get_string_id(string, store)

    lex.cluster = props.get('cluster', 0)
    lex.pos = props.get('pos', 0)
    lex.supersense = props.get('supersense', 0)
    lex.prob = props.get('prob', 0)

    cdef float upper_pc = props.get('upper_pc', 0.0)
    cdef float lower_pc = props.get('lower_pc', 0.0)
    cdef float title_pc = props.get('title_pc', 0.0)

    lex.prefix = get_string_id(string[0], store)
    lex.suffix = get_string_id(string[-3:], store)
    canon_cased = orth.canon_case(string, upper_pc, title_pc, lower_pc)
    lex.norm = get_string_id(canon_cased, store)
    lex.shape = get_string_id(orth.word_shape(string), store)
    lex.asciied = get_string_id(orth.asciied(string), store)
    non_sparse = orth.non_sparse(string, lex.prob, lex.cluster, upper_pc, title_pc, lower_pc)
    lex.vocab10k = get_string_id(non_sparse, store)
    lex.flags = get_flags(string, upper_pc, title_pc, lower_pc)
    return lex

cdef atom_t get_string_id(unicode string, StringStore store) except 0:
    cdef bytes byte_string = string.encode('utf8')
    cdef Utf8Str* orig_str = store.intern(<char*>byte_string, len(byte_string))
    lex.id = orig_str.i
    lex.cluster = 0
    lex.length = len(string)
    lex.flags = get_flags(string)
    # TODO: Hook this up
    #lex.norm = norm_str.i
    #lex.shape = norm_str.i
    #lex.asciied = asciied_str.i
    #lex.prefix = prefix_str.i
    #lex.suffix = suffix_str.i


cdef int from_dict(Lexeme* lex, dict props, StringStore stroe) except -1:
    pass
    return orig_str.i
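The new init function above fills a Lexeme by computing several string views of the word (the literal form, a one-character prefix, a three-character suffix, the canonically-cased norm, the word shape) and interning each one through the StringStore, so the struct only holds integer ids. A rough pure-Python sketch of that derive-and-intern step, with a plain dict standing in for the store and a deliberately simplified shape function (names and details here are illustrative, not spaCy's API):

def simple_shape(string):
    # crude stand-in for orth.word_shape: map letters/digits to class codes
    out = []
    for ch in string:
        if ch.isdigit():
            out.append('d')
        elif ch.isalpha():
            out.append('X' if ch.isupper() else 'x')
        else:
            out.append(ch)
    return ''.join(out)

def string_views(string, table):
    def intern(s):
        # assign the next free id the first time a string is seen
        return table.setdefault(s, len(table) + 1)
    return {
        'sic': intern(string),
        'prefix': intern(string[0]),
        'suffix': intern(string[-3:]),
        'shape': intern(simple_shape(string)),
    }

table = {}
views = string_views('Hello', table)
assert views['prefix'] == table['H'] and views['suffix'] == table['llo']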
@@ -64,11 +64,7 @@ def can_tag(name, thresh=0.5):


# String features
def canon_case(string, prob, cluster, case_stats, tag_stats):
    upper_pc = case_stats.get('upper', 0.0)
    title_pc = case_stats.get('title', 0.0)
    lower_pc = case_stats.get('lower', 0.0)

def canon_case(string, upper_pc=0.0, title_pc=0.0, lower_pc=0.0):
    if upper_pc >= lower_pc and upper_pc >= title_pc:
        return string.upper()
    elif title_pc >= lower_pc:
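canon_case now takes the upper/title/lower frequencies directly instead of a case_stats dict, and returns the word recased to its most frequent form. The hunk cuts off before the title branch; a standalone Python sketch of the full rule, with that branch filled in to match the tests further down (so its body is inferred, not shown in this diff):

def canon_case(string, upper_pc=0.0, title_pc=0.0, lower_pc=0.0):
    # recase the word to the form it most often takes in the corpus
    if upper_pc >= lower_pc and upper_pc >= title_pc:
        return string.upper()
    elif title_pc >= lower_pc:
        return string.title()   # inferred from test_john below
    else:
        return string.lower()

assert canon_case('Nasa', 0.6, 0.3, 0.1) == 'NASA'
assert canon_case('john', 0.3, 0.6, 0.1) == 'John'
assert canon_case('apple', 0.1, 0.3, 0.6) == 'apple'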
@@ -77,7 +73,7 @@ def canon_case(string, prob, cluster, case_stats, tag_stats):
        return string.lower()


def word_shape(string, *args):
def word_shape(string):
    length = len(string)
    shape = []
    last = ""
@@ -103,15 +99,15 @@ def word_shape(string, *args):
    return ''.join(shape)


def non_sparse(string, prob, cluster, case_stats, tag_stats):
def non_sparse(string, prob, cluster, upper_pc, title_pc, lower_pc):
    if is_alpha(string):
        return canon_case(string, prob, cluster, case_stats, tag_stats)
        return canon_case(string, upper_pc, title_pc, lower_pc)
    elif prob >= math.log(0.0001):
        return string
    else:
        return word_shape(string, prob, cluster, case_stats, tag_stats)
        return word_shape(string)


def asciied(string, prob=0, cluster=0, case_stats=None, tag_stats=None):
def asciied(string):
    ascii_string = unidecode(string)
    return ascii_string.decode('ascii')
@@ -31,10 +31,15 @@ cdef class Tagger:
        self._values = <weight_t*>self.mem.alloc(self.extractor.n+1, sizeof(weight_t))
        self._scores = <weight_t*>self.mem.alloc(len(self.tags), sizeof(weight_t))
        self._guess = NULL_TAG
        if path.exists(path.join(model_dir, 'model.gz')):
            with gzip.open(path.join(model_dir, 'model.gz'), 'r') as file_:
                self.model.load(file_)

        if path.exists(path.join(model_dir, 'model')):
            self.model.load(path.join(model_dir, 'model'))
        tags_loc = path.join(model_dir, 'postags.json')
        if path.exists(tags_loc):
            with open(tags_loc) as file_:
                Tagger.tags.update(ujson.load(file_))
        if path.exists(path.join(model_dir, 'strings')):
            EN.lexicon.strings.load(path.join(model_dir, 'strings'))

    cpdef class_t predict(self, int i, Tokens tokens, class_t prev, class_t prev_prev) except 0:
        assert i >= 0
        get_atoms(self._atoms, tokens.lex[i-2], tokens.lex[i-1], tokens.lex[i],

@@ -125,7 +130,7 @@ cdef int get_atoms(atom_t* atoms, Lexeme* p2, Lexeme* p1, Lexeme* n0, Lexeme* n1


cdef inline void _fill_token(atom_t* atoms, Lexeme* lex) nogil:
    atoms[0] = lex.id
    atoms[0] = lex.i
    atoms[1] = lex.cluster
    atoms[2] = lex.norm
    atoms[3] = lex.shape
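The constructor change above drops the gzipped model.gz in favour of a plain 'model' file, and additionally pulls the tag set from postags.json and the shared string table from 'strings' whenever those files are present in the model directory. A small Python sketch of that load-whatever-is-there pattern (the file names follow the diff; the two loader callables are placeholders for the real model and StringStore loaders):

import json
from os import path

def load_model_dir(model_dir, load_weights, load_strings, tags):
    # every component is optional: only load the files that actually exist
    if path.exists(path.join(model_dir, 'model')):
        load_weights(path.join(model_dir, 'model'))
    tags_loc = path.join(model_dir, 'postags.json')
    if path.exists(tags_loc):
        with open(tags_loc) as file_:
            tags.update(json.load(file_))
    if path.exists(path.join(model_dir, 'strings')):
        load_strings(path.join(model_dir, 'strings'))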
@@ -37,6 +37,7 @@ cdef class Token:
    cdef public atom_t lex_pos
    cdef public atom_t lex_supersense

    cdef public atom_t sic
    cdef public atom_t norm
    cdef public atom_t shape
    cdef public atom_t vocab10k
@@ -101,16 +101,18 @@ cdef class Tokens:
@cython.freelist(64)
cdef class Token:
    def __init__(self, StringStore string_store, int i, int idx, int pos, dict lex):
        assert i < 1000000
        self._string_store = string_store
        self.i = i
        self.id = i
        self.idx = idx
        self.pos = pos

        self.id = lex['id']
        self.id = lex['i']
        self.cluster = lex['cluster']
        self.length = lex['length']
        self.lex_pos = lex['pos']
        self.lex_supersense = lex['supersense']
        self.sic = lex['sic']
        self.norm = lex['norm']
        self.shape = lex['shape']
        self.vocab10k = lex['vocab10k']

@@ -122,6 +124,6 @@ cdef class Token:

    property string:
        def __get__(self):
            cdef bytes utf8string = self._string_store[self.id]
            cdef bytes utf8string = self._string_store[self.sic]
            return utf8string.decode('utf8')
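Token now copies the lexeme's interned ids out of the lex dict (keyed by 'i' rather than 'id') and resolves its surface text through the shared string store by the 'sic' id. A toy stand-in showing that lookup path, with a plain dict in place of StringStore (class and names are illustrative only):

class ToyToken(object):
    def __init__(self, string_store, i, idx, pos, lex):
        self._string_store = string_store
        self.i = i
        self.idx = idx
        self.pos = pos
        self.sic = lex['sic']      # interned id of the literal word form

    @property
    def string(self):
        # resolve the text by looking the 'sic' id back up in the store
        return self._string_store[self.sic]

store = {1: 'Hello'}
tok = ToyToken(store, 0, 0, 0, {'sic': 1})
assert tok.string == 'Hello'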
@@ -2,6 +2,8 @@ from libc.string cimport memcpy

from murmurhash.mrmr cimport hash64

import ujson


cdef class StringStore:
    def __init__(self):

@@ -51,3 +53,20 @@ cdef class StringStore:
        else:
            i = <size_t>value
        return &self.strings[i]

    def dump(self, loc):
        strings = []
        cdef Utf8Str* string
        cdef bytes py_string
        for i in range(self.size):
            string = &self.strings[i]
            py_string = string.chars[:string.length]
            strings.append(py_string)
        with open(loc, 'w') as file_:
            ujson.dump(strings, file_, ensure_ascii=False)

    def load(self, loc):
        with open(loc) as file_:
            strings = ujson.load(file_)
        for string in strings[1:]:
            self.intern(string, len(string))
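StringStore.dump/load above write the interned strings out as one JSON list and re-intern them on load, skipping the entry at index 0 (apparently a reserved slot). A self-contained Python sketch of that round trip, with a list plus dict standing in for the store (json replaces ujson here, and the helper names are illustrative):

import json

def dump_strings(loc, table):
    # the whole table is written, including the reserved entry at index 0
    with open(loc, 'w') as file_:
        json.dump(table, file_, ensure_ascii=False)

def load_strings(loc):
    with open(loc) as file_:
        strings = json.load(file_)
    table = ['']              # slot 0 stays reserved, as in the store itself
    index = {}
    for s in strings[1:]:     # skip the reserved entry, like StringStore.load
        index[s] = len(table)
        table.append(s)
    return table, index

dump_strings('strings.json', ['', 'Hello', "'s"])
table, index = load_strings('strings.json')
assert table == ['', 'Hello', "'s"] and index['Hello'] == 1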
@@ -5,16 +5,16 @@ import py.test
from spacy.orth import canon_case as cc

def test_nasa():
    assert cc('Nasa', 0.0, 0, {'upper': 0.6, 'title': 0.3, 'lower': 0.1}, {}) == 'NASA'
    assert cc('Nasa', 0.6, 0.3, 0.1) == 'NASA'


def test_john():
    assert cc('john', 0.0, 0, {'title': 0.6, 'upper': 0.3, 'lower': 0.1}, {}) == 'John'
    assert cc('john', 0.3, 0.6, 0.1) == 'John'


def test_apple():
    assert cc('apple', 0.0, 0, {'lower': 0.6, 'title': 0.3, 'upper': 0.1}, {}) == 'apple'
    assert cc('apple', 0.1, 0.3, 0.6) == 'apple'


def test_tie():
    assert cc('I', 0.0, 0, {'lower': 0.0, 'title': 0.0, 'upper': 0.0}, {}) == 'I'
    assert cc('I', 0.0, 0.0, 0.0) == 'I'
@@ -5,8 +5,8 @@ from spacy.en import EN

def test_possess():
    tokens = EN.tokenize("Mike's")
    assert EN.lexicon.strings[tokens[0].id] == "Mike"
    assert EN.lexicon.strings[tokens[1].id] == "'s"
    assert EN.lexicon.strings[tokens[0].sic] == "Mike"
    assert EN.lexicon.strings[tokens[1].sic] == "'s"
    assert len(tokens) == 2
@@ -5,21 +5,21 @@ import math


def test_common_case_upper():
    cases = {'upper': 0.7, 'lower': 0.2, 'title': 0.1}
    cases = {'u': 0.7, 'l': 0.2, 't': 0.1}
    prob = math.log(0.1)
    assert non_sparse('usa', prob, 0, cases, {}) == 'USA'
    assert non_sparse('usa', prob, 0, cases['u'], cases['t'], cases['l']) == 'USA'

def test_same():
    cases = {'upper': 0.01, 'title': 0.09, 'lower': 0.9}
    cases = {'u': 0.01, 't': 0.09, 'l': 0.9}
    prob = math.log(0.5)
    assert non_sparse('the', prob, 0, cases, {}) == 'the'
    assert non_sparse('the', prob, 0, cases['u'], cases['t'], cases['l']) == 'the'

def test_common_case_lower():
    prob = math.log(0.5)
    cases = {'upper': 0.01, 'title': 0.09, 'lower': 0.9}
    assert non_sparse('The', prob, 0, cases, {}) == 'the'
    cases = {'u': 0.01, 't': 0.09, 'l': 0.9}
    assert non_sparse('The', prob, 0, cases['u'], cases['t'], cases['l']) == 'the'

def test_shape():
    prob = math.log(0.00001)
    cases = {'upper': 0.0, 'title': 0.0, 'lower': 0.0}
    assert non_sparse('1999', prob, 0, cases, {}) == 'dddd'
    cases = {'u': 0.0, 't': 0.0, 'l': 0.0}
    assert non_sparse('1999', prob, 0, cases['u'], cases['t'], cases['l']) == 'dddd'
@@ -27,17 +27,17 @@ def test_punct():
def test_digits():
    tokens = EN.tokenize('The year: 1984.')
    assert len(tokens) == 5
    assert tokens[0].id == EN.lexicon.lookup('The')['id']
    assert tokens[3].id == EN.lexicon.lookup('1984')['id']
    assert tokens[0].sic == EN.lexicon.lookup('The')['sic']
    assert tokens[3].sic == EN.lexicon.lookup('1984')['sic']


def test_contraction():
    tokens = EN.tokenize("don't giggle")
    assert len(tokens) == 3
    assert tokens[1].id == EN.lexicon.lookup("not")['id']
    assert tokens[1].sic == EN.lexicon.lookup("not")['sic']
    tokens = EN.tokenize("i said don't!")
    assert len(tokens) == 5
    assert tokens[4].id == EN.lexicon.lookup('!')['id']
    assert tokens[4].sic == EN.lexicon.lookup('!')['sic']


def test_contraction_punct():
@@ -5,19 +5,19 @@ from spacy.en import EN

def test_neq():
    addr = EN.lexicon.lookup('Hello')
    assert EN.lexicon.lookup('bye')['id'] != addr['id']
    assert EN.lexicon.lookup('bye')['sic'] != addr['sic']


def test_eq():
    addr = EN.lexicon.lookup('Hello')
    assert EN.lexicon.lookup('Hello')['id'] == addr['id']
    assert EN.lexicon.lookup('Hello')['sic'] == addr['sic']


def test_case_neq():
    addr = EN.lexicon.lookup('Hello')
    assert EN.lexicon.lookup('hello')['id'] != addr['id']
    assert EN.lexicon.lookup('hello')['sic'] != addr['sic']


def test_punct_neq():
    addr = EN.lexicon.lookup('Hello')
    assert EN.lexicon.lookup('Hello,')['id'] != addr['id']
    assert EN.lexicon.lookup('Hello,')['sic'] != addr['sic']