From ea8f1e70536bcd48fb1360343acacc1f757061ba Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Thu, 30 Oct 2014 18:14:42 +1100
Subject: [PATCH] * Tighten interfaces

---
 spacy/en.pyx   |  2 +-
 spacy/lang.pxd |  1 -
 spacy/lang.pyx | 49 ++++++++++---------------------------------------
 spacy/util.py  |  9 +--------
 4 files changed, 12 insertions(+), 49 deletions(-)

diff --git a/spacy/en.pyx b/spacy/en.pyx
index f29e45c9c..95c1cbd94 100644
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@@ -50,4 +50,4 @@ cdef class English(Language):
     pass
 
 
-EN = English('en', [], [])
+EN = English('en')
diff --git a/spacy/lang.pxd b/spacy/lang.pxd
index 9d6419557..4234b04b3 100644
--- a/spacy/lang.pxd
+++ b/spacy/lang.pxd
@@ -27,7 +27,6 @@ cdef class Lexicon:
     cpdef readonly size_t size
 
     cpdef readonly StringStore strings
-    cpdef Lexeme lookup(self, unicode string)
     cdef Lexeme* get(self, String* s) except NULL
 
     cdef PreshMap _dict
diff --git a/spacy/lang.pyx b/spacy/lang.pyx
index 114c10c66..98205b354 100644
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -1,11 +1,5 @@
 # cython: profile=True
 # cython: embedsignature=True
-"""Common classes and utilities across languages.
-
-Provides the main implementation for the spacy tokenizer. Specific languages
-subclass the Language class, over-writing the tokenization rules as necessary.
-Special-case tokenization rules are read from data/<lang>/tokenization .
-"""
 from __future__ import unicode_literals
 
 import json
@@ -24,27 +18,22 @@ from preshed.maps cimport PreshMap
 from .lexeme cimport Lexeme
 from .lexeme cimport init as lexeme_init
 
-from . import orth
 from . import util
 from .util import read_lang_data
 from .tokens import Tokens
 
 
 cdef class Language:
-    """Base class for language-specific tokenizers.
-
-    The language's name is used to look up default data-files, found in data/<name>.
-    """
[...]
-            lexeme = <Lexeme*>self.mem.alloc(1, sizeof(Lexeme))
-            lexeme[0] = lexeme_init(string.chars[:string.n], string.key, self.strings,
-                                    lexeme_dict)
-            self._dict.set(string.key, lexeme)
-            self.size += 1
-
-    def set(self, unicode py_string, dict lexeme_dict):
-        cdef String string
-        string_from_unicode(&string, py_string)
-        cdef Lexeme* lex = self.get(&string)
-        lex[0] = lexeme_init(string.chars[:string.n], string.key, self.strings, lexeme_dict)
 
     cdef Lexeme* get(self, String* string) except NULL:
         cdef Lexeme* lex
@@ -263,20 +237,18 @@ cdef class Lexicon:
             self.size += 1
         return lex
 
-    cpdef Lexeme lookup(self, unicode uni_string):
-        """Retrieve (or create, if not found) a Lexeme for a string, and return it.
-
-        Args
-            string (unicode): The string to be looked up. Must be unicode, not bytes.
-
-        Returns:
-            lexeme (Lexeme): A reference to a lexical type.
-        """
+    def __getitem__(self, unicode uni_string):
         cdef String string
         string_from_unicode(&string, uni_string)
         cdef Lexeme* lexeme = self.get(&string)
         return lexeme[0]
 
+    def __setitem__(self, unicode uni_string, dict props):
+        cdef String s
+        string_from_unicode(&s, uni_string)
+        cdef Lexeme* lex = self.get(&s)
+        lex[0] = lexeme_init(s.chars[:s.n], s.key, self.strings, props)
+
     def dump(self, loc):
         if path.exists(loc):
             assert not path.isdir(loc)
@@ -316,7 +288,6 @@ cdef class Lexicon:
                 break
            self._dict.set(key, lexeme)
             i += 1
-        print "Load %d lexemes" % i
         fclose(fp)
diff --git a/spacy/util.py b/spacy/util.py
index d06911400..5062ca6db 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -17,14 +17,7 @@ def read_lang_data(name):
     prefix = read_prefix(data_dir)
     suffix = read_suffix(data_dir)
     infix = read_infix(data_dir)
-
-    lex_loc = path.join(data_dir, 'lexemes.json')
-    if path.exists(lex_loc):
-        with open(lex_loc) as file_:
-            lexemes = ujson.load(file_)
-    else:
-        lexemes = {}
-    return tokenization, prefix, suffix, infix, lexemes
+    return tokenization, prefix, suffix, infix
 
 
 def read_prefix(data_dir):