* Tighten interfaces

Matthew Honnibal, 2014-10-30 18:14:42 +11:00
commit ea8f1e7053, parent ea85bf3a0a
4 changed files with 12 additions and 49 deletions

View File

@@ -50,4 +50,4 @@ cdef class English(Language):
     pass

-EN = English('en', [], [])
+EN = English('en')
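English no longer takes the two placeholder list arguments; the base Language class now locates its default data files from the name alone. A minimal usage sketch, assuming this file is importable as spacy.en and that lexicon is exposed as an attribute, as the constructor in the next file suggests (neither is shown in this diff):

    from spacy.en import EN          # module-level singleton, built as English('en')
    lexeme = EN.lexicon[u'the']      # lexicon data is loaded from data/en/ at construction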

View File

@@ -27,7 +27,6 @@ cdef class Lexicon:
     cpdef readonly size_t size
     cpdef readonly StringStore strings
-    cpdef Lexeme lookup(self, unicode string)
     cdef Lexeme* get(self, String* s) except NULL
     cdef PreshMap _dict

View File

@@ -1,11 +1,5 @@
 # cython: profile=True
 # cython: embedsignature=True
-"""Common classes and utilities across languages.
-
-Provides the main implementation for the spacy tokenizer. Specific languages
-subclass the Language class, over-writing the tokenization rules as necessary.
-Special-case tokenization rules are read from data/<lang>/tokenization .
-"""
 from __future__ import unicode_literals
 import json
@@ -24,27 +18,22 @@ from preshed.maps cimport PreshMap
 from .lexeme cimport Lexeme
 from .lexeme cimport init as lexeme_init
-from . import orth
 from . import util
 from .util import read_lang_data
 from .tokens import Tokens


 cdef class Language:
-    """Base class for language-specific tokenizers.
-
-    The language's name is used to look up default data-files, found in data/<name.
-    """
     def __init__(self, name):
         self.name = name
         self.mem = Pool()
         self._cache = PreshMap(2 ** 25)
         self._specials = PreshMap(2 ** 16)
-        rules, prefix, suffix, infix, lexemes = util.read_lang_data(name)
+        rules, prefix, suffix, infix = util.read_lang_data(name)
         self._prefix_re = re.compile(prefix)
         self._suffix_re = re.compile(suffix)
         self._infix_re = re.compile(infix)
-        self.lexicon = Lexicon(lexemes)
+        self.lexicon = Lexicon()
         if path.exists(path.join(util.DATA_DIR, name, 'lexemes')):
             self.lexicon.load(path.join(util.DATA_DIR, name, 'lexemes'))
             self.lexicon.strings.load(path.join(util.DATA_DIR, name, 'strings'))
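The net effect on initialisation: lexemes no longer arrive as a JSON-derived dict through read_lang_data(), but are loaded from binary dumps under DATA_DIR. A before/after sketch in plain Python (names taken from the diff; not code from the repo):

    # before this commit
    rules, prefix, suffix, infix, lexemes = util.read_lang_data(name)
    self.lexicon = Lexicon(lexemes)      # eagerly built from lexemes.json

    # after this commit
    rules, prefix, suffix, infix = util.read_lang_data(name)
    self.lexicon = Lexicon()             # starts empty
    self.lexicon.load(path.join(util.DATA_DIR, name, 'lexemes'))    # binary dump from Lexicon.dump()
    self.lexicon.strings.load(path.join(util.DATA_DIR, name, 'strings'))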
@@ -231,26 +220,11 @@ cdef class Language:
 cdef class Lexicon:
-    def __init__(self, lexemes):
+    def __init__(self):
         self.mem = Pool()
         self._dict = PreshMap(2 ** 20)
         self.strings = StringStore()
         self.size = 1
-        cdef String string
-        cdef Lexeme* lexeme
-        for py_string, lexeme_dict in lexemes.iteritems():
-            string_from_unicode(&string, py_string)
-            lexeme = <Lexeme*>self.mem.alloc(1, sizeof(Lexeme))
-            lexeme[0] = lexeme_init(string.chars[:string.n], string.key, self.strings,
-                                    lexeme_dict)
-            self._dict.set(string.key, lexeme)
-            self.size += 1
-
-    def set(self, unicode py_string, dict lexeme_dict):
-        cdef String string
-        string_from_unicode(&string, py_string)
-        cdef Lexeme* lex = self.get(&string)
-        lex[0] = lexeme_init(string.chars[:string.n], string.key, self.strings, lexeme_dict)

     cdef Lexeme* get(self, String* string) except NULL:
         cdef Lexeme* lex
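Callers that used to hand a lexemes dict to the constructor can reproduce the old eager population through the mapping interface added in the next hunk. A hypothetical shim, not part of this commit:

    lexicon = Lexicon()
    for py_string, lexeme_dict in lexemes.items():
        lexicon[py_string] = lexeme_dict   # __setitem__ does what the removed set() did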
@@ -263,20 +237,18 @@ cdef class Lexicon:
             self.size += 1
         return lex

-    cpdef Lexeme lookup(self, unicode uni_string):
-        """Retrieve (or create, if not found) a Lexeme for a string, and return it.
-
-        Args
-            string (unicode): The string to be looked up. Must be unicode, not bytes.
-
-        Returns:
-            lexeme (Lexeme): A reference to a lexical type.
-        """
+    def __getitem__(self, unicode uni_string):
         cdef String string
         string_from_unicode(&string, uni_string)
         cdef Lexeme* lexeme = self.get(&string)
         return lexeme[0]

+    def __setitem__(self, unicode uni_string, dict props):
+        cdef String s
+        string_from_unicode(&s, uni_string)
+        cdef Lexeme* lex = self.get(&s)
+        lex[0] = lexeme_init(s.chars[:s.n], s.key, self.strings, props)

     def dump(self, loc):
         if path.exists(loc):
             assert not path.isdir(loc)
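With lookup() and set() gone, the Lexicon's Python-visible API is dict-like. A hedged usage sketch (the property key is illustrative, not taken from this commit):

    lex = lexicon[u'dog']                # was lexicon.lookup(u'dog'); creates the entry if absent
    lexicon[u'dog'] = {'prob': -8.5}     # was lexicon.set(...); re-initialises the Lexeme in place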
@@ -316,7 +288,6 @@ cdef class Lexicon:
                 break
             self._dict.set(key, lexeme)
             i += 1
-        print "Load %d lexemes" % i
         fclose(fp)

View File

@@ -17,14 +17,7 @@ def read_lang_data(name):
     prefix = read_prefix(data_dir)
     suffix = read_suffix(data_dir)
     infix = read_infix(data_dir)
-
-    lex_loc = path.join(data_dir, 'lexemes.json')
-    if path.exists(lex_loc):
-        with open(lex_loc) as file_:
-            lexemes = ujson.load(file_)
-    else:
-        lexemes = {}
-    return tokenization, prefix, suffix, infix, lexemes
+    return tokenization, prefix, suffix, infix


 def read_prefix(data_dir):
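Any external caller of read_lang_data() must now unpack four values; lexeme data travels separately via Lexicon.load(). A sketch of an updated call site (the 'en' argument and import path are illustrative):

    import re
    from spacy.util import read_lang_data   # assumed import path

    rules, prefix, suffix, infix = read_lang_data('en')
    prefix_re = re.compile(prefix)           # as in Language.__init__ above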