From ea8f1e70536bcd48fb1360343acacc1f757061ba Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Thu, 30 Oct 2014 18:14:42 +1100
Subject: [PATCH] * Tighten interfaces

---
 spacy/en.pyx   |  2 +-
 spacy/lang.pxd |  1 -
 spacy/lang.pyx | 49 ++++++++++---------------------------------------
 spacy/util.py  |  9 +--------
 4 files changed, 12 insertions(+), 49 deletions(-)

diff --git a/spacy/en.pyx b/spacy/en.pyx
index f29e45c9c..95c1cbd94 100644
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@@ -50,4 +50,4 @@ cdef class English(Language):
     pass
 
 
-EN = English('en', [], [])
+EN = English('en')
diff --git a/spacy/lang.pxd b/spacy/lang.pxd
index 9d6419557..4234b04b3 100644
--- a/spacy/lang.pxd
+++ b/spacy/lang.pxd
@@ -27,7 +27,6 @@ cdef class Lexicon:
     cpdef readonly size_t size
 
     cpdef readonly StringStore strings
-    cpdef Lexeme lookup(self, unicode string)
     cdef Lexeme* get(self, String* s) except NULL
 
     cdef PreshMap _dict
diff --git a/spacy/lang.pyx b/spacy/lang.pyx
index 114c10c66..98205b354 100644
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -1,11 +1,5 @@
 # cython: profile=True
 # cython: embedsignature=True
-"""Common classes and utilities across languages.
-
-Provides the main implementation for the spacy tokenizer. Specific languages
-subclass the Language class, over-writing the tokenization rules as necessary.
-Special-case tokenization rules are read from data/<lang>/tokenization .
-"""
 from __future__ import unicode_literals
 
 import json
@@ -24,27 +18,22 @@ from preshed.maps cimport PreshMap
 from .lexeme cimport Lexeme
 from .lexeme cimport init as lexeme_init
 
-from . import orth
 from . import util
 from .util import read_lang_data
 from .tokens import Tokens
 
 
 cdef class Language:
-    """Base class for language-specific tokenizers.
-
-    The language's name is used to look up default data-files, found in data/<name>.
-    """
[...]
-            lexeme = <Lexeme*>self.mem.alloc(1, sizeof(Lexeme))
-            lexeme[0] = lexeme_init(string.chars[:string.n], string.key, self.strings,
-                                    lexeme_dict)
-            self._dict.set(string.key, lexeme)
-            self.size += 1
-
-    def set(self, unicode py_string, dict lexeme_dict):
-        cdef String string
-        string_from_unicode(&string, py_string)
-        cdef Lexeme* lex = self.get(&string)
-        lex[0] = lexeme_init(string.chars[:string.n], string.key, self.strings, lexeme_dict)
 
     cdef Lexeme* get(self, String* string) except NULL:
         cdef Lexeme* lex
@@ -263,20 +237,18 @@ cdef class Lexicon:
             self.size += 1
         return lex
 
-    cpdef Lexeme lookup(self, unicode uni_string):
-        """Retrieve (or create, if not found) a Lexeme for a string, and return it.
-
-        Args
-            string (unicode): The string to be looked up. Must be unicode, not bytes.
-
-        Returns:
-            lexeme (Lexeme): A reference to a lexical type.
-        """
+    def __getitem__(self, unicode uni_string):
         cdef String string
         string_from_unicode(&string, uni_string)
         cdef Lexeme* lexeme = self.get(&string)
         return lexeme[0]
 
+    def __setitem__(self, unicode uni_string, dict props):
+        cdef String s
+        string_from_unicode(&s, uni_string)
+        cdef Lexeme* lex = self.get(&s)
+        lex[0] = lexeme_init(s.chars[:s.n], s.key, self.strings, props)
+
     def dump(self, loc):
         if path.exists(loc):
             assert not path.isdir(loc)
@@ -316,7 +288,6 @@ cdef class Lexicon:
                 break
            self._dict.set(key, lexeme)
             i += 1
-        print "Load %d lexemes" % i
         fclose(fp)
diff --git a/spacy/util.py b/spacy/util.py
index d06911400..5062ca6db 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -17,14 +17,7 @@ def read_lang_data(name):
     prefix = read_prefix(data_dir)
     suffix = read_suffix(data_dir)
     infix = read_infix(data_dir)
-
-    lex_loc = path.join(data_dir, 'lexemes.json')
-    if path.exists(lex_loc):
-        with open(lex_loc) as file_:
-            lexemes = ujson.load(file_)
-    else:
-        lexemes = {}
-    return tokenization, prefix, suffix, infix, lexemes
+    return tokenization, prefix, suffix, infix
 
 
 def read_prefix(data_dir):