Mirror of https://github.com/explosion/spaCy.git, synced 2025-01-26 17:24:41 +03:00
* Rewriting Lexeme serialization.
This commit is contained in:
parent 234d49bf4d
commit 13909a2e24
@@ -13,16 +13,16 @@ import random
from os import path
import re

from cymem.cymem cimport Pool
from cython.operator cimport preincrement as preinc
from cython.operator cimport dereference as deref
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE

from cymem.cymem cimport Pool
from murmurhash.mrmr cimport hash64
from preshed.maps cimport PreshMap

from .lexeme cimport Lexeme
from .lexeme cimport from_dict as lexeme_from_dict
from .lexeme cimport from_string as lexeme_from_string
from .lexeme cimport init as lexeme_init

from . import orth
from . import util
@@ -232,26 +232,27 @@ cdef class Lexicon:
        self.mem = Pool()
        self._dict = PreshMap(2 ** 20)
        self.strings = StringStore()
        self.size = 0
        self.size = 1
        cdef String string
        cdef Lexeme* lexeme
        #for py_string, lexeme_dict in lexemes.iteritems():
        #    string_from_unicode(&string, py_string)
        #    lexeme = <Lexeme*>self.mem.alloc(1, sizeof(Lexeme))
        #    lexeme_from_dict(lexeme, lexeme_dict, self.strings)
        #    self._dict.set(string.key, lexeme)
        #    self.lexemes.push_back(lexeme)
        #    self.size += 1
        for py_string, lexeme_dict in lexemes.iteritems():
            string_from_unicode(&string, py_string)
            lexeme = <Lexeme*>self.mem.alloc(1, sizeof(Lexeme))
            lexeme[0] = lexeme_init(string.chars[:string.n], string.key, self.size,
                                    self.strings, lexeme_dict)
            self._dict.set(lexeme.hash, lexeme)
            self.lexemes.push_back(lexeme)
            self.size += 1

    cdef Lexeme* get(self, String* string) except NULL:
        cdef Lexeme* lex
        lex = <Lexeme*>self._dict.get(string.key)
        if lex != NULL:
            return lex

        lex = <Lexeme*>self.mem.alloc(1, sizeof(Lexeme))
        lexeme_from_string(lex, string.chars[:string.n], self.strings)
        self._dict.set(string.key, lex)
        lex = <Lexeme*>self.mem.alloc(sizeof(Lexeme), 1)
        lex[0] = lexeme_init(string.chars[:string.n], string.key, self.size,
                             self.strings, {})
        self._dict.set(lex.hash, lex)
        self.lexemes.push_back(lex)
        self.size += 1
        return lex
@@ -270,6 +271,34 @@ cdef class Lexicon:
        cdef Lexeme* lexeme = self.get(&string)
        return lexeme[0]

    def dump(self, loc):
        if path.exists(loc):
            assert not path.isdir(loc)
        cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
        cdef FILE* fp = fopen(<char*>bytes_loc, 'wb')
        assert fp != NULL
        cdef size_t st
        for i in range(self.size):
            st = fwrite(self.lexemes[i], sizeof(Lexeme), 1, fp)
            assert st == 1
        st = fclose(fp)
        assert st == 0

    def load(self, loc):
        assert path.exists(loc)
        cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
        cdef FILE* fp = fopen(<char*>bytes_loc, 'rb')
        assert fp != NULL
        cdef size_t st
        cdef Lexeme* lexeme
        while True:
            lexeme = <Lexeme*>self.mem.alloc(sizeof(Lexeme), 1)
            st = fread(lexeme, sizeof(lexeme), 1, fp)
            if st == 0:
                break
            self.lexemes.push_back(lexeme)
            self._dict.set(lexeme.hash, lexeme)


cdef void string_from_unicode(String* s, unicode uni):
    cdef Py_UNICODE* c_uni = <Py_UNICODE*>uni
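The new Lexicon.dump/load above serialise the lexicon by streaming raw Lexeme structs to disk as fixed-width binary records and reading them back until fread comes up short. A minimal pure-Python sketch of the same round-trip idea, using a hypothetical two-field record rather than the real Lexeme layout:

import struct

RECORD = struct.Struct('<Qq')  # hypothetical layout: 64-bit hash, 64-bit index

def dump_records(loc, records):
    # write each (hash, index) pair as one fixed-width binary record
    with open(loc, 'wb') as f:
        for hashed, i in records:
            f.write(RECORD.pack(hashed, i))

def load_records(loc):
    # keep reading whole records until the file runs out, like the fread loop above
    out = []
    with open(loc, 'rb') as f:
        while True:
            buf = f.read(RECORD.size)
            if len(buf) < RECORD.size:
                break
            out.append(RECORD.unpack(buf))
    return out

dump_records('lexemes.bin', [(12345, 1), (67890, 2)])
assert load_records('lexemes.bin') == [(12345, 1), (67890, 2)]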
@@ -23,9 +23,11 @@ cpdef enum:


cdef struct Lexeme:
    atom_t id
    hash_t hash
    atom_t i
    atom_t length


    atom_t sic
    atom_t norm
    atom_t shape
    atom_t vocab10k
@@ -44,12 +46,9 @@ cdef struct Lexeme:

cdef Lexeme EMPTY_LEXEME


cdef int from_string(Lexeme* lex, unicode string, StringStore store) except -1


cdef int from_dict(Lexeme* lex, dict props, StringStore store) except -1

cpdef Lexeme init(unicode string, hash_t hashed, atom_t i,
                  StringStore store, dict props) except *


cdef inline bint check_flag(Lexeme* lexeme, size_t flag_id) nogil:
    return lexeme.flags & (1 << flag_id)
@@ -1,5 +1,6 @@
from cpython.ref cimport Py_INCREF
from cymem.cymem cimport Pool
from murmurhash.mrmr cimport hash64

from libc.string cimport memset
@@ -12,7 +13,7 @@ OOV_DIST_FLAGS = 0
memset(&EMPTY_LEXEME, 0, sizeof(Lexeme))


def get_flags(unicode string):
def get_flags(unicode string, float upper_pc, float title_pc, float lower_pc):
    cdef flag_t flags = 0
    flags |= orth.is_alpha(string) << IS_ALPHA
    flags |= orth.is_ascii(string) << IS_ASCII
@@ -25,20 +26,36 @@ def get_flags(unicode string):
    return flags


cdef int from_string(Lexeme* lex, unicode string, StringStore store) except -1:
cpdef Lexeme init(unicode string, hash_t hashed, atom_t i,
                  StringStore store, dict props) except *:
    cdef Lexeme lex
    lex.hash = hashed
    lex.i = i
    print string, i
    lex.length = len(string)
    lex.sic = get_string_id(string, store)

    lex.cluster = props.get('cluster', 0)
    lex.pos = props.get('pos', 0)
    lex.supersense = props.get('supersense', 0)
    lex.prob = props.get('prob', 0)

    cdef float upper_pc = props.get('upper_pc', 0.0)
    cdef float lower_pc = props.get('lower_pc', 0.0)
    cdef float title_pc = props.get('title_pc', 0.0)

    lex.prefix = get_string_id(string[0], store)
    lex.suffix = get_string_id(string[-3:], store)
    canon_cased = orth.canon_case(string, upper_pc, title_pc, lower_pc)
    lex.norm = get_string_id(canon_cased, store)
    lex.shape = get_string_id(orth.word_shape(string), store)
    lex.asciied = get_string_id(orth.asciied(string), store)
    non_sparse = orth.non_sparse(string, lex.prob, lex.cluster, upper_pc, title_pc, lower_pc)
    lex.vocab10k = get_string_id(non_sparse, store)
    lex.flags = get_flags(string, upper_pc, title_pc, lower_pc)
    return lex

cdef atom_t get_string_id(unicode string, StringStore store) except 0:
    cdef bytes byte_string = string.encode('utf8')
    cdef Utf8Str* orig_str = store.intern(<char*>byte_string, len(byte_string))
    lex.id = orig_str.i
    lex.cluster = 0
    lex.length = len(string)
    lex.flags = get_flags(string)
    # TODO: Hook this up
    #lex.norm = norm_str.i
    #lex.shape = norm_str.i
    #lex.asciied = asciied_str.i
    #lex.prefix = prefix_str.i
    #lex.suffix = suffix_str.i


cdef int from_dict(Lexeme* lex, dict props, StringStore stroe) except -1:
    pass
    return orig_str.i
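The new init function above fills a Lexeme by computing several string views of the word (the literal form, a one-character prefix, a three-character suffix, the canonically-cased norm, the word shape) and interning each one through the StringStore, so the struct only holds integer ids. A rough pure-Python sketch of that derive-and-intern step, with a plain dict standing in for the store and a deliberately simplified shape function (names and details here are illustrative, not spaCy's API):

def simple_shape(string):
    # crude stand-in for orth.word_shape: map letters/digits to class codes
    out = []
    for ch in string:
        if ch.isdigit():
            out.append('d')
        elif ch.isalpha():
            out.append('X' if ch.isupper() else 'x')
        else:
            out.append(ch)
    return ''.join(out)

def string_views(string, table):
    def intern(s):
        # assign the next free id the first time a string is seen
        return table.setdefault(s, len(table) + 1)
    return {
        'sic': intern(string),
        'prefix': intern(string[0]),
        'suffix': intern(string[-3:]),
        'shape': intern(simple_shape(string)),
    }

table = {}
views = string_views('Hello', table)
assert views['prefix'] == table['H'] and views['suffix'] == table['llo']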
@@ -64,11 +64,7 @@ def can_tag(name, thresh=0.5):


# String features
def canon_case(string, prob, cluster, case_stats, tag_stats):
    upper_pc = case_stats.get('upper', 0.0)
    title_pc = case_stats.get('title', 0.0)
    lower_pc = case_stats.get('lower', 0.0)

def canon_case(string, upper_pc=0.0, title_pc=0.0, lower_pc=0.0):
    if upper_pc >= lower_pc and upper_pc >= title_pc:
        return string.upper()
    elif title_pc >= lower_pc:
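canon_case now takes the upper/title/lower frequencies directly instead of a case_stats dict, and returns the word recased to its most frequent form. The hunk cuts off before the title branch; a standalone Python sketch of the full rule, with that branch filled in to match the tests further down (so its body is inferred, not shown in this diff):

def canon_case(string, upper_pc=0.0, title_pc=0.0, lower_pc=0.0):
    # recase the word to the form it most often takes in the corpus
    if upper_pc >= lower_pc and upper_pc >= title_pc:
        return string.upper()
    elif title_pc >= lower_pc:
        return string.title()   # inferred from test_john below
    else:
        return string.lower()

assert canon_case('Nasa', 0.6, 0.3, 0.1) == 'NASA'
assert canon_case('john', 0.3, 0.6, 0.1) == 'John'
assert canon_case('apple', 0.1, 0.3, 0.6) == 'apple'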
@@ -77,7 +73,7 @@ def canon_case(string, prob, cluster, case_stats, tag_stats):
        return string.lower()


def word_shape(string, *args):
def word_shape(string):
    length = len(string)
    shape = []
    last = ""
@@ -103,15 +99,15 @@ def word_shape(string, *args):
    return ''.join(shape)


def non_sparse(string, prob, cluster, case_stats, tag_stats):
def non_sparse(string, prob, cluster, upper_pc, title_pc, lower_pc):
    if is_alpha(string):
        return canon_case(string, prob, cluster, case_stats, tag_stats)
        return canon_case(string, upper_pc, title_pc, lower_pc)
    elif prob >= math.log(0.0001):
        return string
    else:
        return word_shape(string, prob, cluster, case_stats, tag_stats)
        return word_shape(string)


def asciied(string, prob=0, cluster=0, case_stats=None, tag_stats=None):
def asciied(string):
    ascii_string = unidecode(string)
    return ascii_string.decode('ascii')
@@ -31,10 +31,15 @@ cdef class Tagger:
        self._values = <weight_t*>self.mem.alloc(self.extractor.n+1, sizeof(weight_t))
        self._scores = <weight_t*>self.mem.alloc(len(self.tags), sizeof(weight_t))
        self._guess = NULL_TAG
        if path.exists(path.join(model_dir, 'model.gz')):
            with gzip.open(path.join(model_dir, 'model.gz'), 'r') as file_:
                self.model.load(file_)

        if path.exists(path.join(model_dir, 'model')):
            self.model.load(path.join(model_dir, 'model'))
        tags_loc = path.join(model_dir, 'postags.json')
        if path.exists(tags_loc):
            with open(tags_loc) as file_:
                Tagger.tags.update(ujson.load(file_))
        if path.exists(path.join(model_dir, 'strings')):
            EN.lexicon.strings.load(path.join(model_dir, 'strings'))

    cpdef class_t predict(self, int i, Tokens tokens, class_t prev, class_t prev_prev) except 0:
        assert i >= 0
        get_atoms(self._atoms, tokens.lex[i-2], tokens.lex[i-1], tokens.lex[i],

@@ -125,7 +130,7 @@ cdef int get_atoms(atom_t* atoms, Lexeme* p2, Lexeme* p1, Lexeme* n0, Lexeme* n1


cdef inline void _fill_token(atom_t* atoms, Lexeme* lex) nogil:
    atoms[0] = lex.id
    atoms[0] = lex.i
    atoms[1] = lex.cluster
    atoms[2] = lex.norm
    atoms[3] = lex.shape
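The constructor change above drops the gzipped model.gz in favour of a plain 'model' file, and additionally pulls the tag set from postags.json and the shared string table from 'strings' whenever those files are present in the model directory. A small Python sketch of that load-whatever-is-there pattern (the file names follow the diff; the two loader callables are placeholders for the real model and StringStore loaders):

import json
from os import path

def load_model_dir(model_dir, load_weights, load_strings, tags):
    # every component is optional: only load the files that actually exist
    if path.exists(path.join(model_dir, 'model')):
        load_weights(path.join(model_dir, 'model'))
    tags_loc = path.join(model_dir, 'postags.json')
    if path.exists(tags_loc):
        with open(tags_loc) as file_:
            tags.update(json.load(file_))
    if path.exists(path.join(model_dir, 'strings')):
        load_strings(path.join(model_dir, 'strings'))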
@@ -37,6 +37,7 @@ cdef class Token:
    cdef public atom_t lex_pos
    cdef public atom_t lex_supersense

    cdef public atom_t sic
    cdef public atom_t norm
    cdef public atom_t shape
    cdef public atom_t vocab10k
@@ -101,16 +101,18 @@ cdef class Tokens:
@cython.freelist(64)
cdef class Token:
    def __init__(self, StringStore string_store, int i, int idx, int pos, dict lex):
        assert i < 1000000
        self._string_store = string_store
        self.i = i
        self.id = i
        self.idx = idx
        self.pos = pos

        self.id = lex['id']
        self.id = lex['i']
        self.cluster = lex['cluster']
        self.length = lex['length']
        self.lex_pos = lex['pos']
        self.lex_supersense = lex['supersense']
        self.sic = lex['sic']
        self.norm = lex['norm']
        self.shape = lex['shape']
        self.vocab10k = lex['vocab10k']

@@ -122,6 +124,6 @@ cdef class Token:

    property string:
        def __get__(self):
            cdef bytes utf8string = self._string_store[self.id]
            cdef bytes utf8string = self._string_store[self.sic]
            return utf8string.decode('utf8')
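Token now copies the lexeme's interned ids out of the lex dict (keyed by 'i' rather than 'id') and resolves its surface text through the shared string store by the 'sic' id. A toy stand-in showing that lookup path, with a plain dict in place of StringStore (class and names are illustrative only):

class ToyToken(object):
    def __init__(self, string_store, i, idx, pos, lex):
        self._string_store = string_store
        self.i = i
        self.idx = idx
        self.pos = pos
        self.sic = lex['sic']      # interned id of the literal word form

    @property
    def string(self):
        # resolve the text by looking the 'sic' id back up in the store
        return self._string_store[self.sic]

store = {1: 'Hello'}
tok = ToyToken(store, 0, 0, 0, {'sic': 1})
assert tok.string == 'Hello'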
@@ -2,6 +2,8 @@ from libc.string cimport memcpy

from murmurhash.mrmr cimport hash64

import ujson


cdef class StringStore:
    def __init__(self):

@@ -51,3 +53,20 @@ cdef class StringStore:
        else:
            i = <size_t>value
        return &self.strings[i]

    def dump(self, loc):
        strings = []
        cdef Utf8Str* string
        cdef bytes py_string
        for i in range(self.size):
            string = &self.strings[i]
            py_string = string.chars[:string.length]
            strings.append(py_string)
        with open(loc, 'w') as file_:
            ujson.dump(strings, file_, ensure_ascii=False)

    def load(self, loc):
        with open(loc) as file_:
            strings = ujson.load(file_)
        for string in strings[1:]:
            self.intern(string, len(string))
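StringStore.dump/load above write the interned strings out as one JSON list and re-intern them on load, skipping the entry at index 0 (apparently a reserved slot). A self-contained Python sketch of that round trip, with a list plus dict standing in for the store (json replaces ujson here, and the helper names are illustrative):

import json

def dump_strings(loc, table):
    # the whole table is written, including the reserved entry at index 0
    with open(loc, 'w') as file_:
        json.dump(table, file_, ensure_ascii=False)

def load_strings(loc):
    with open(loc) as file_:
        strings = json.load(file_)
    table = ['']              # slot 0 stays reserved, as in the store itself
    index = {}
    for s in strings[1:]:     # skip the reserved entry, like StringStore.load
        index[s] = len(table)
        table.append(s)
    return table, index

dump_strings('strings.json', ['', 'Hello', "'s"])
table, index = load_strings('strings.json')
assert table == ['', 'Hello', "'s"] and index['Hello'] == 1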
@@ -5,16 +5,16 @@ import py.test
from spacy.orth import canon_case as cc

def test_nasa():
    assert cc('Nasa', 0.0, 0, {'upper': 0.6, 'title': 0.3, 'lower': 0.1}, {}) == 'NASA'
    assert cc('Nasa', 0.6, 0.3, 0.1) == 'NASA'


def test_john():
    assert cc('john', 0.0, 0, {'title': 0.6, 'upper': 0.3, 'lower': 0.1}, {}) == 'John'
    assert cc('john', 0.3, 0.6, 0.1) == 'John'


def test_apple():
    assert cc('apple', 0.0, 0, {'lower': 0.6, 'title': 0.3, 'upper': 0.1}, {}) == 'apple'
    assert cc('apple', 0.1, 0.3, 0.6) == 'apple'


def test_tie():
    assert cc('I', 0.0, 0, {'lower': 0.0, 'title': 0.0, 'upper': 0.0}, {}) == 'I'
    assert cc('I', 0.0, 0.0, 0.0) == 'I'
@@ -5,8 +5,8 @@ from spacy.en import EN

def test_possess():
    tokens = EN.tokenize("Mike's")
    assert EN.lexicon.strings[tokens[0].id] == "Mike"
    assert EN.lexicon.strings[tokens[1].id] == "'s"
    assert EN.lexicon.strings[tokens[0].sic] == "Mike"
    assert EN.lexicon.strings[tokens[1].sic] == "'s"
    assert len(tokens) == 2
@@ -5,21 +5,21 @@ import math


def test_common_case_upper():
    cases = {'upper': 0.7, 'lower': 0.2, 'title': 0.1}
    cases = {'u': 0.7, 'l': 0.2, 't': 0.1}
    prob = math.log(0.1)
    assert non_sparse('usa', prob, 0, cases, {}) == 'USA'
    assert non_sparse('usa', prob, 0, cases['u'], cases['t'], cases['l']) == 'USA'

def test_same():
    cases = {'upper': 0.01, 'title': 0.09, 'lower': 0.9}
    cases = {'u': 0.01, 't': 0.09, 'l': 0.9}
    prob = math.log(0.5)
    assert non_sparse('the', prob, 0, cases, {}) == 'the'
    assert non_sparse('the', prob, 0, cases['u'], cases['t'], cases['l']) == 'the'

def test_common_case_lower():
    prob = math.log(0.5)
    cases = {'upper': 0.01, 'title': 0.09, 'lower': 0.9}
    assert non_sparse('The', prob, 0, cases, {}) == 'the'
    cases = {'u': 0.01, 't': 0.09, 'l': 0.9}
    assert non_sparse('The', prob, 0, cases['u'], cases['t'], cases['l']) == 'the'

def test_shape():
    prob = math.log(0.00001)
    cases = {'upper': 0.0, 'title': 0.0, 'lower': 0.0}
    assert non_sparse('1999', prob, 0, cases, {}) == 'dddd'
    cases = {'u': 0.0, 't': 0.0, 'l': 0.0}
    assert non_sparse('1999', prob, 0, cases['u'], cases['t'], cases['l']) == 'dddd'
@@ -27,17 +27,17 @@ def test_punct():
def test_digits():
    tokens = EN.tokenize('The year: 1984.')
    assert len(tokens) == 5
    assert tokens[0].id == EN.lexicon.lookup('The')['id']
    assert tokens[3].id == EN.lexicon.lookup('1984')['id']
    assert tokens[0].sic == EN.lexicon.lookup('The')['sic']
    assert tokens[3].sic == EN.lexicon.lookup('1984')['sic']


def test_contraction():
    tokens = EN.tokenize("don't giggle")
    assert len(tokens) == 3
    assert tokens[1].id == EN.lexicon.lookup("not")['id']
    assert tokens[1].sic == EN.lexicon.lookup("not")['sic']
    tokens = EN.tokenize("i said don't!")
    assert len(tokens) == 5
    assert tokens[4].id == EN.lexicon.lookup('!')['id']
    assert tokens[4].sic == EN.lexicon.lookup('!')['sic']


def test_contraction_punct():
@@ -5,19 +5,19 @@ from spacy.en import EN

def test_neq():
    addr = EN.lexicon.lookup('Hello')
    assert EN.lexicon.lookup('bye')['id'] != addr['id']
    assert EN.lexicon.lookup('bye')['sic'] != addr['sic']


def test_eq():
    addr = EN.lexicon.lookup('Hello')
    assert EN.lexicon.lookup('Hello')['id'] == addr['id']
    assert EN.lexicon.lookup('Hello')['sic'] == addr['sic']


def test_case_neq():
    addr = EN.lexicon.lookup('Hello')
    assert EN.lexicon.lookup('hello')['id'] != addr['id']
    assert EN.lexicon.lookup('hello')['sic'] != addr['sic']


def test_punct_neq():
    addr = EN.lexicon.lookup('Hello')
    assert EN.lexicon.lookup('Hello,')['id'] != addr['id']
    assert EN.lexicon.lookup('Hello,')['sic'] != addr['sic']