* Tighten interfaces

Matthew Honnibal 2014-10-30 18:14:42 +11:00
parent ea85bf3a0a
commit ea8f1e7053
4 changed files with 12 additions and 49 deletions

View File

@@ -50,4 +50,4 @@ cdef class English(Language):
     pass


-EN = English('en', [], [])
+EN = English('en')
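The call-site change above drops the two list arguments; rule and lexeme data are now located by name instead of being passed in. A minimal plain-Python sketch of the tightened constructor (the stub classes are illustrative stand-ins, not the real Cython types):

class Language(object):
    def __init__(self, name):
        # Rules, affixes and lexemes are now found from the data
        # directory by name, rather than supplied by every caller.
        self.name = name

class English(Language):
    pass

EN = English('en')   # was: EN = English('en', [], [])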

View File

@@ -27,7 +27,6 @@ cdef class Lexicon:
     cpdef readonly size_t size
     cpdef readonly StringStore strings

-    cpdef Lexeme lookup(self, unicode string)
     cdef Lexeme* get(self, String* s) except NULL

     cdef PreshMap _dict

View File

@@ -1,11 +1,5 @@
 # cython: profile=True
 # cython: embedsignature=True
-"""Common classes and utilities across languages.
-
-Provides the main implementation for the spacy tokenizer. Specific languages
-subclass the Language class, over-writing the tokenization rules as necessary.
-Special-case tokenization rules are read from data/<lang>/tokenization .
-"""
 from __future__ import unicode_literals

 import json
@@ -24,27 +18,22 @@ from preshed.maps cimport PreshMap
 from .lexeme cimport Lexeme
 from .lexeme cimport init as lexeme_init

 from . import orth
 from . import util
-from .util import read_lang_data
 from .tokens import Tokens


 cdef class Language:
-    """Base class for language-specific tokenizers.
-
-    The language's name is used to look up default data-files, found in data/<name>.
-    """
     def __init__(self, name):
         self.name = name
         self.mem = Pool()
         self._cache = PreshMap(2 ** 25)
         self._specials = PreshMap(2 ** 16)
-        rules, prefix, suffix, infix, lexemes = util.read_lang_data(name)
+        rules, prefix, suffix, infix = util.read_lang_data(name)
         self._prefix_re = re.compile(prefix)
         self._suffix_re = re.compile(suffix)
         self._infix_re = re.compile(infix)
-        self.lexicon = Lexicon(lexemes)
+        self.lexicon = Lexicon()
         if path.exists(path.join(util.DATA_DIR, name, 'lexemes')):
             self.lexicon.load(path.join(util.DATA_DIR, name, 'lexemes'))
         self.lexicon.strings.load(path.join(util.DATA_DIR, name, 'strings'))
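Language.__init__ now builds an empty Lexicon and only loads lexemes and strings when a saved copy exists under the data directory. A runnable sketch of that load-if-present flow; the path constant and stub Lexicon are assumptions, not the real util.DATA_DIR or Cython class:

from os import path

DATA_DIR = '/usr/local/share/spacy/data'   # stand-in for util.DATA_DIR

class Lexicon(object):                     # illustrative stub
    def __init__(self):
        self._entries = {}                 # starts empty
    def load(self, loc):
        pass                               # the real load() reads a binary lexeme dump

lexicon = Lexicon()
lexemes_loc = path.join(DATA_DIR, 'en', 'lexemes')
if path.exists(lexemes_loc):               # loading is optional, not required
    lexicon.load(lexemes_loc)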
@@ -231,26 +220,11 @@ cdef class Language:


 cdef class Lexicon:
-    def __init__(self, lexemes):
+    def __init__(self):
         self.mem = Pool()
         self._dict = PreshMap(2 ** 20)
         self.strings = StringStore()
         self.size = 1
-        cdef String string
-        cdef Lexeme* lexeme
-        for py_string, lexeme_dict in lexemes.iteritems():
-            string_from_unicode(&string, py_string)
-            lexeme = <Lexeme*>self.mem.alloc(1, sizeof(Lexeme))
-            lexeme[0] = lexeme_init(string.chars[:string.n], string.key, self.strings,
-                                    lexeme_dict)
-            self._dict.set(string.key, lexeme)
-            self.size += 1
-
-    def set(self, unicode py_string, dict lexeme_dict):
-        cdef String string
-        string_from_unicode(&string, py_string)
-        cdef Lexeme* lex = self.get(&string)
-        lex[0] = lexeme_init(string.chars[:string.n], string.key, self.strings, lexeme_dict)

     cdef Lexeme* get(self, String* string) except NULL:
         cdef Lexeme* lex
@@ -263,20 +237,18 @@ cdef class Lexicon:
             self.size += 1
         return lex

-    cpdef Lexeme lookup(self, unicode uni_string):
-        """Retrieve (or create, if not found) a Lexeme for a string, and return it.
-
-        Args
-            string (unicode): The string to be looked up. Must be unicode, not bytes.
-
-        Returns:
-            lexeme (Lexeme): A reference to a lexical type.
-        """
+    def __getitem__(self, unicode uni_string):
         cdef String string
         string_from_unicode(&string, uni_string)
         cdef Lexeme* lexeme = self.get(&string)
         return lexeme[0]

+    def __setitem__(self, unicode uni_string, dict props):
+        cdef String s
+        string_from_unicode(&s, uni_string)
+        cdef Lexeme* lex = self.get(&s)
+        lex[0] = lexeme_init(s.chars[:s.n], s.key, self.strings, props)
+
     def dump(self, loc):
         if path.exists(loc):
             assert not path.isdir(loc)
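The hunk above swaps the cpdef lookup() method for __getitem__, and, together with the removal of set() in the previous hunk, __setitem__ now covers writes, so a Lexicon is used like a mapping. A plain-Python model of the get-or-create semantics, with dicts standing in for the Lexeme structs:

class Lexicon(object):                     # illustrative stub, not the Cython class
    def __init__(self):
        self._dict = {}
    def __getitem__(self, string):
        # get-or-create, mirroring the cdef get() helper above
        if string not in self._dict:
            self._dict[string] = {'string': string}
        return self._dict[string]
    def __setitem__(self, string, props):
        self._dict[string] = props

lexicon = Lexicon()
lex = lexicon[u'hello']                    # was: lexicon.lookup(u'hello')
lexicon[u'hello'] = {'prob': -8.0}         # was: lexicon.set(...); 'prob' is an assumed example property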
@@ -316,7 +288,6 @@ cdef class Lexicon:
                 break
             self._dict.set(key, lexeme)
             i += 1
-        print "Load %d lexemes" % i
         fclose(fp)

View File

@@ -17,14 +17,7 @@ def read_lang_data(name):
     prefix = read_prefix(data_dir)
     suffix = read_suffix(data_dir)
     infix = read_infix(data_dir)
-
-    lex_loc = path.join(data_dir, 'lexemes.json')
-    if path.exists(lex_loc):
-        with open(lex_loc) as file_:
-            lexemes = ujson.load(file_)
-    else:
-        lexemes = {}
-    return tokenization, prefix, suffix, infix, lexemes
+    return tokenization, prefix, suffix, infix


 def read_prefix(data_dir):
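Downstream, read_lang_data() now returns a 4-tuple: the lexemes dict has left the signature, and lexeme data travels through Lexicon.load() instead. A minimal stand-in showing the new contract (stub values only; the real helpers read the data/<name> files):

def read_lang_data(name):                  # stub modelling the new signature
    tokenization, prefix, suffix, infix = [], '', '', ''
    return tokenization, prefix, suffix, infix

# Callers now unpack four values; `lexemes` is no longer part of the tuple.
rules, prefix, suffix, infix = read_lang_data('en')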