* Tighten interfaces

Matthew Honnibal 2014-10-30 18:14:42 +11:00
parent ea85bf3a0a
commit ea8f1e7053
4 changed files with 12 additions and 49 deletions

View File

@@ -50,4 +50,4 @@ cdef class English(Language):
     pass


-EN = English('en', [], [])
+EN = English('en')
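The call-site change above drops the two list arguments; rule and lexeme data are now located by name instead of being passed in. A minimal plain-Python sketch of the tightened constructor (the stub classes are illustrative stand-ins, not the real Cython types):

class Language(object):
    def __init__(self, name):
        # Rules, affixes and lexemes are now found from the data
        # directory by name, rather than supplied by every caller.
        self.name = name

class English(Language):
    pass

EN = English('en')   # was: EN = English('en', [], [])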

View File

@@ -27,7 +27,6 @@ cdef class Lexicon:
     cpdef readonly size_t size
     cpdef readonly StringStore strings

-    cpdef Lexeme lookup(self, unicode string)
     cdef Lexeme* get(self, String* s) except NULL

     cdef PreshMap _dict

View File

@@ -1,11 +1,5 @@
 # cython: profile=True
 # cython: embedsignature=True
-"""Common classes and utilities across languages.
-
-Provides the main implementation for the spacy tokenizer. Specific languages
-subclass the Language class, over-writing the tokenization rules as necessary.
-Special-case tokenization rules are read from data/<lang>/tokenization .
-"""
 from __future__ import unicode_literals

 import json
@@ -24,27 +18,22 @@ from preshed.maps cimport PreshMap
 from .lexeme cimport Lexeme
 from .lexeme cimport init as lexeme_init

 from . import orth
 from . import util
-from .util import read_lang_data
 from .tokens import Tokens


 cdef class Language:
-    """Base class for language-specific tokenizers.
-
-    The language's name is used to look up default data-files, found in data/<name>.
-    """
     def __init__(self, name):
         self.name = name
         self.mem = Pool()
         self._cache = PreshMap(2 ** 25)
         self._specials = PreshMap(2 ** 16)
-        rules, prefix, suffix, infix, lexemes = util.read_lang_data(name)
+        rules, prefix, suffix, infix = util.read_lang_data(name)
         self._prefix_re = re.compile(prefix)
         self._suffix_re = re.compile(suffix)
         self._infix_re = re.compile(infix)
-        self.lexicon = Lexicon(lexemes)
+        self.lexicon = Lexicon()
         if path.exists(path.join(util.DATA_DIR, name, 'lexemes')):
             self.lexicon.load(path.join(util.DATA_DIR, name, 'lexemes'))
         self.lexicon.strings.load(path.join(util.DATA_DIR, name, 'strings'))
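Language.__init__ now builds an empty Lexicon and only loads lexemes and strings when a saved copy exists under the data directory. A runnable sketch of that load-if-present flow; the path constant and stub Lexicon are assumptions, not the real util.DATA_DIR or Cython class:

from os import path

DATA_DIR = '/usr/local/share/spacy/data'   # stand-in for util.DATA_DIR

class Lexicon(object):                     # illustrative stub
    def __init__(self):
        self._entries = {}                 # starts empty
    def load(self, loc):
        pass                               # the real load() reads a binary lexeme dump

lexicon = Lexicon()
lexemes_loc = path.join(DATA_DIR, 'en', 'lexemes')
if path.exists(lexemes_loc):               # loading is optional, not required
    lexicon.load(lexemes_loc)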
@@ -231,26 +220,11 @@ cdef class Language:


 cdef class Lexicon:
-    def __init__(self, lexemes):
+    def __init__(self):
         self.mem = Pool()
         self._dict = PreshMap(2 ** 20)
         self.strings = StringStore()
         self.size = 1
-        cdef String string
-        cdef Lexeme* lexeme
-        for py_string, lexeme_dict in lexemes.iteritems():
-            string_from_unicode(&string, py_string)
-            lexeme = <Lexeme*>self.mem.alloc(1, sizeof(Lexeme))
-            lexeme[0] = lexeme_init(string.chars[:string.n], string.key, self.strings,
-                                    lexeme_dict)
-            self._dict.set(string.key, lexeme)
-            self.size += 1
-
-    def set(self, unicode py_string, dict lexeme_dict):
-        cdef String string
-        string_from_unicode(&string, py_string)
-        cdef Lexeme* lex = self.get(&string)
-        lex[0] = lexeme_init(string.chars[:string.n], string.key, self.strings, lexeme_dict)

     cdef Lexeme* get(self, String* string) except NULL:
         cdef Lexeme* lex
@@ -263,20 +237,18 @@ cdef class Lexicon:
             self.size += 1
         return lex

-    cpdef Lexeme lookup(self, unicode uni_string):
-        """Retrieve (or create, if not found) a Lexeme for a string, and return it.
-
-        Args
-            string (unicode): The string to be looked up. Must be unicode, not bytes.
-
-        Returns:
-            lexeme (Lexeme): A reference to a lexical type.
-        """
+    def __getitem__(self, unicode uni_string):
         cdef String string
         string_from_unicode(&string, uni_string)
         cdef Lexeme* lexeme = self.get(&string)
         return lexeme[0]

+    def __setitem__(self, unicode uni_string, dict props):
+        cdef String s
+        string_from_unicode(&s, uni_string)
+        cdef Lexeme* lex = self.get(&s)
+        lex[0] = lexeme_init(s.chars[:s.n], s.key, self.strings, props)
+
     def dump(self, loc):
         if path.exists(loc):
             assert not path.isdir(loc)
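The hunk above swaps the cpdef lookup() method for __getitem__, and, together with the removal of set() in the previous hunk, __setitem__ now covers writes, so a Lexicon is used like a mapping. A plain-Python model of the get-or-create semantics, with dicts standing in for the Lexeme structs:

class Lexicon(object):                     # illustrative stub, not the Cython class
    def __init__(self):
        self._dict = {}
    def __getitem__(self, string):
        # get-or-create, mirroring the cdef get() helper above
        if string not in self._dict:
            self._dict[string] = {'string': string}
        return self._dict[string]
    def __setitem__(self, string, props):
        self._dict[string] = props

lexicon = Lexicon()
lex = lexicon[u'hello']                    # was: lexicon.lookup(u'hello')
lexicon[u'hello'] = {'prob': -8.0}         # was: lexicon.set(...); 'prob' is an assumed example property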
@@ -316,7 +288,6 @@ cdef class Lexicon:
                 break
             self._dict.set(key, lexeme)
             i += 1
-        print "Load %d lexemes" % i
         fclose(fp)

View File

@@ -17,14 +17,7 @@ def read_lang_data(name):
     prefix = read_prefix(data_dir)
     suffix = read_suffix(data_dir)
     infix = read_infix(data_dir)
-
-    lex_loc = path.join(data_dir, 'lexemes.json')
-    if path.exists(lex_loc):
-        with open(lex_loc) as file_:
-            lexemes = ujson.load(file_)
-    else:
-        lexemes = {}
-    return tokenization, prefix, suffix, infix, lexemes
+    return tokenization, prefix, suffix, infix


 def read_prefix(data_dir):
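Downstream, read_lang_data() now returns a 4-tuple: the lexemes dict has left the signature, and lexeme data travels through Lexicon.load() instead. A minimal stand-in showing the new contract (stub values only; the real helpers read the data/<name> files):

def read_lang_data(name):                  # stub modelling the new signature
    tokenization, prefix, suffix, infix = [], '', '', ''
    return tokenization, prefix, suffix, infix

# Callers now unpack four values; `lexemes` is no longer part of the tuple.
rules, prefix, suffix, infix = read_lang_data('en')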