From fdaf24604a2496e0666a53232e997df05560c2e1 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 27 Aug 2014 19:38:57 +0200
Subject: [PATCH] * Basic punct tests updated and passing

---
 spacy/__init__.py            | 14 -------------
 spacy/en.pxd                 | 37 ---------------------------------
 spacy/en.pyx                 |  8 ++++----
 spacy/lang.pxd               |  4 ++++
 spacy/lang.pyx               | 18 +++++++++++-----
 spacy/word.pxd               |  2 +-
 spacy/word.pyx               | 40 +++++++++++++++++++++---------------
 tests/test_post_punct.py     | 24 ++++++++++------------
 tests/test_pre_punct.py      | 28 ++++++++++++------------
 tests/test_surround_punct.py | 26 +++++++++++------------
 10 files changed, 81 insertions(+), 120 deletions(-)

diff --git a/spacy/__init__.py b/spacy/__init__.py
index 16d71aec6..e69de29bb 100644
--- a/spacy/__init__.py
+++ b/spacy/__init__.py
@@ -1,14 +0,0 @@
-from .lexeme import lex_of
-from .lexeme import length_of
-
-from .tokens import Tokens
-
-# Don't know how to get the enum Python visible :(
-
-LEX = 0
-NORM = 1
-SHAPE = 2
-LAST3 = 3
-LENGTH = 4
-
-__all__ = [Tokens, lex_of, length_of, LEX, NORM, SHAPE, LAST3, LENGTH]
diff --git a/spacy/en.pxd b/spacy/en.pxd
index 2c9f4c718..1a08834ec 100644
--- a/spacy/en.pxd
+++ b/spacy/en.pxd
@@ -3,42 +3,5 @@ from spacy.word cimport Lexeme
 
 cimport cython
 
-cpdef size_t ALPHA
-cpdef size_t DIGIT
-cpdef size_t PUNCT
-cpdef size_t SPACE
-cpdef size_t LOWER
-cpdef size_t UPPER
-cpdef size_t TITLE
-cpdef size_t ASCII
-
-cpdef size_t OFT_LOWER
-cpdef size_t OFT_TITLE
-cpdef size_t OFT_UPPER
-
-cpdef size_t PUNCT
-cpdef size_t CONJ
-cpdef size_t NUM
-cpdef size_t N
-cpdef size_t DET
-cpdef size_t ADP
-cpdef size_t ADJ
-cpdef size_t ADV
-cpdef size_t VERB
-cpdef size_t NOUN
-cpdef size_t PDT
-cpdef size_t POS
-cpdef size_t PRON
-cpdef size_t PRT
-
-cpdef size_t SIC
-cpdef size_t CANON_CASED
-cpdef size_t SHAPE
-cpdef size_t NON_SPARSE
-
-
 cdef class English(Language):
     cpdef int _split_one(self, unicode word)
-
-
-cpdef English EN
diff --git a/spacy/en.pyx b/spacy/en.pyx
index c4185968f..98f96610a 100644
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@@ -84,10 +84,10 @@ EN = English('en')
 
 
 # Thresholds for frequency related flags
-TAG_THRESH = 0.5
-LOWER_THRESH = 0.5
-UPPER_THRESH = 0.3
-TITLE_THRESH = 0.9
+cdef double TAG_THRESH = 0.5
+cdef double LOWER_THRESH = 0.5
+cdef double UPPER_THRESH = 0.3
+cdef double TITLE_THRESH = 0.9
 
 
 # Python-readable flag constants --- can't read an enum from Python
diff --git a/spacy/lang.pxd b/spacy/lang.pxd
index 841e18818..e86fc926e 100644
--- a/spacy/lang.pxd
+++ b/spacy/lang.pxd
@@ -4,6 +4,10 @@ from spacy.word cimport Lexeme
 
 
 cdef class Lexicon:
+    cdef public dict probs
+    cdef public dict clusters
+    cdef public dict case_stats
+    cdef public dict tag_stats
     cdef public list flag_checkers
     cdef public list string_transformers
 
diff --git a/spacy/lang.pyx b/spacy/lang.pyx
index 3713e9320..8e64ca828 100644
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -20,7 +20,7 @@ cdef class Language:
         self.name = name
         self.cache = {}
         self.lexicon = Lexicon()
-        self.load_tokenization(util.read_tokenization(name))
+        #self.load_special_tokenization(util.read_tokenization(name))
 
     cpdef list tokenize(self, unicode string):
         """Tokenize a string.
@@ -57,7 +57,7 @@ cdef class Language:
         cdef list lexemes = []
         substrings = self._split(string)
         for i, substring in enumerate(substrings):
-            lexemes.append(self.lookup(substring))
+            lexemes.append(self.lexicon.lookup(substring))
         self.cache[string] = lexemes
         return lexemes
 
@@ -108,7 +108,11 @@ cdef class Language:
 cdef class Lexicon:
     def __cinit__(self):
         self.flag_checkers = []
-        self.string_transforms = []
+        self.string_transformers = []
+        self.probs = {}
+        self.clusters = {}
+        self.case_stats = {}
+        self.tag_stats = {}
         self.lexicon = {}
 
     cpdef Lexeme lookup(self, unicode string):
@@ -151,6 +155,7 @@ cdef class Lexicon:
     def load_probs(self, location):
         """Load unigram probabilities.
         """
+        # Dict mapping words to floats
        self.probs = json.load(location)
 
         cdef Lexeme word
@@ -161,18 +166,21 @@ cdef class Lexicon:
             word.prob = prob
 
     def load_clusters(self, location):
-        self.probs = json.load(location)
+        # TODO: Find out endianness
+        # Dict mapping words to ??-endian ints
+        self.clusters = json.load(location)
 
         cdef Lexeme word
         cdef unicode string
 
         for string, word in self.lexicon.items():
-            cluster = _pop_default(self.cluster, string, 0)
+            cluster = _pop_default(self.clusters, string, 0)
             word.cluster = cluster
 
     def load_stats(self, location):
         """Load distributional stats.
         """
+        # Dict mapping string to dict of arbitrary stuff.
         raise NotImplementedError
 
 
diff --git a/spacy/word.pxd b/spacy/word.pxd
index 4e9d416fa..bdddfd53e 100644
--- a/spacy/word.pxd
+++ b/spacy/word.pxd
@@ -12,7 +12,7 @@ cdef class Lexeme:
     cpdef readonly double prob
     cpdef readonly size_t cluster
 
-    cdef utf8_t* views
+    cdef list views
     cdef size_t nr_views
 
     cdef readonly flag_t flags
diff --git a/spacy/word.pyx b/spacy/word.pyx
index 99c0845a3..d411e96c8 100644
--- a/spacy/word.pyx
+++ b/spacy/word.pyx
@@ -49,35 +49,41 @@ cdef class Lexeme:
     while "dapple" is totally different. On the other hand, "scalable" receives
     the same cluster ID as "pineapple", which is not what we'd like.
     """
-    def __cinit__(self, utf8_t string, size_t length, list views, prob=0.0,
-                  flags=0):
-        self.id = &string
-        self.length = length
-        self.nr_strings = 0
-        self.add_views(views)
+    def __cinit__(self, unicode string, prob, cluster, case_stats,
+                  tag_stats, flag_checkers, string_transformers):
+        self.prob = prob
+        self.cluster = cluster
+        self.length = len(string)
+        self.id = hash(string)
+
+        self.nr_views = len(string_transformers)
+        self.views = []
+        cdef unicode view
+        for i, string_transformer in enumerate(string_transformers):
+            view = string_transformer(string, prob, case_stats, tag_stats)
+            self.views.append(view)
+
+        for i, flag_checker in enumerate(flag_checkers):
+            if flag_checker(string, prob, case_stats, tag_stats):
+                self.set_flag(i)
 
     def __dealloc__(self):
-        free(self.views)
+        pass
 
     property string:
         def __get__(self):
-            return self.strings[0].decode('utf8')
+            return self.views[0]
 
     cpdef unicode get_view_string(self, size_t i):
-        assert i < self.nr_strings
-        return self.strings[i].decode('utf8')
+        assert i < self.nr_views
+        return self.views[i]
 
     cpdef id_t get_view_id(self, size_t i) except 0:
-        assert i < self.nr_strings
-        return &self.views[i]
+        return hash(self.views[i])
 
     cpdef int add_view(self, unicode view) except -1:
         self.nr_views += 1
-        self.views = realloc(self.views, self.nr_views * sizeof(utf8_t))
-        cdef bytes utf8_string = view.encode('utf8')
-        # Intern strings, allowing pointer comparison
-        utf8_string = intern(utf8_string)
-        self.views[self.nr_views - 1] = utf8_string
+        self.views.append(view)
 
     cpdef bint check_flag(self, size_t flag_id) except *:
         """Access the value of one of the pre-computed boolean distribution features.
diff --git a/tests/test_post_punct.py b/tests/test_post_punct.py
index e5d2d0705..5ee0eb066 100644
--- a/tests/test_post_punct.py
+++ b/tests/test_post_punct.py
@@ -1,8 +1,6 @@
 from __future__ import unicode_literals
 
-from spacy.en import lookup
-from spacy.en import tokenize
-from spacy.en import unhash
+from spacy.en import EN
 
 import pytest
 
@@ -16,28 +14,28 @@ def test_close(close_puncts):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p
-        tokens = tokenize(string)
+        tokens = EN.tokenize(string)
         assert len(tokens) == 2
-        assert unhash(tokens[1].lex) == p
-        assert unhash(tokens[0].lex) == word_str
+        assert tokens[1].string == p
+        assert tokens[0].string == word_str
 
 
 def test_two_different_close(close_puncts):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p + "'"
-        tokens = tokenize(string)
+        tokens = EN.tokenize(string)
         assert len(tokens) == 3
-        assert unhash(tokens[0].lex) == word_str
-        assert unhash(tokens[1].lex) == p
-        assert unhash(tokens[2].lex) == "'"
+        assert tokens[0].string == word_str
+        assert tokens[1].string == p
+        assert tokens[2].string == "'"
 
 
 def test_three_same_close(close_puncts):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p + p + p
-        tokens = tokenize(string)
+        tokens = EN.tokenize(string)
         assert len(tokens) == 4
-        assert unhash(tokens[0].lex) == word_str
-        assert unhash(tokens[1].lex) == p
+        assert tokens[0].string == word_str
+        assert tokens[1].string == p
diff --git a/tests/test_pre_punct.py b/tests/test_pre_punct.py
index 83e743c44..557655330 100644
--- a/tests/test_pre_punct.py
+++ b/tests/test_pre_punct.py
@@ -1,8 +1,6 @@
 from __future__ import unicode_literals
 
-from spacy.en import lookup
-from spacy.en import tokenize
-from spacy.en import unhash
+from spacy.en import EN
 
 import pytest
 
@@ -16,35 +14,35 @@ def test_open(open_puncts):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + word_str
-        tokens = tokenize(string)
+        tokens = EN.tokenize(string)
         assert len(tokens) == 2
-        assert unhash(tokens[0].lex) == p
-        assert unhash(tokens[1].lex) == word_str
+        assert tokens[0].string == p
+        assert tokens[1].string == word_str
 
 
 def test_two_different_open(open_puncts):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + "`" + word_str
-        tokens = tokenize(string)
+        tokens = EN.tokenize(string)
         assert len(tokens) == 3
-        assert unhash(tokens[0].lex) == p
-        assert unhash(tokens[1].lex) == "`"
-        assert unhash(tokens[2].lex) == word_str
+        assert tokens[0].string == p
+        assert tokens[1].string == "`"
+        assert tokens[2].string == word_str
 
 
 def test_three_same_open(open_puncts):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + p + p + word_str
-        tokens = tokenize(string)
+        tokens = EN.tokenize(string)
         assert len(tokens) == 4
-        assert unhash(tokens[0].lex) == p
-        assert unhash(tokens[3].lex) == word_str
+        assert tokens[0].string == p
+        assert tokens[3].string == word_str
 
 
 def test_open_appostrophe():
     string = "'The"
-    tokens = tokenize(string)
+    tokens = EN.tokenize(string)
     assert len(tokens) == 2
-    assert unhash(tokens[0].lex) == "'"
+    assert tokens[0].string == "'"
diff --git a/tests/test_surround_punct.py b/tests/test_surround_punct.py
index 686d8cfc2..b7be782f2 100644
--- a/tests/test_surround_punct.py
+++ b/tests/test_surround_punct.py
@@ -1,8 +1,6 @@
 from __future__ import unicode_literals
 
-from spacy.en import tokenize
-from spacy.en import lookup
-from spacy.en import unhash
+from spacy.en import EN
 
 import pytest
 
@@ -16,22 +14,22 @@ def test_token(paired_puncts):
     word_str = 'Hello'
     for open_, close_ in paired_puncts:
         string = open_ + word_str + close_
-        tokens = tokenize(string)
+        tokens = EN.tokenize(string)
         assert len(tokens) == 3
-        assert unhash(tokens[0].lex) == open_
-        assert unhash(tokens[1].lex) == word_str
-        assert unhash(tokens[2].lex) == close_
+        assert tokens[0].string == open_
+        assert tokens[1].string == word_str
+        assert tokens[2].string == close_
 
 
 def test_two_different(paired_puncts):
     word_str = 'Hello'
    for open_, close_ in paired_puncts:
         string = "`" + open_ + word_str + close_ + "'"
-        tokens = tokenize(string)
+        tokens = EN.tokenize(string)
         assert len(tokens) == 5
-        assert unhash(tokens[0].lex) == "`"
-        assert unhash(tokens[1].lex) == open_
-        assert unhash(tokens[2].lex) == word_str
-        assert unhash(tokens[2].lex) == word_str
-        assert unhash(tokens[3].lex) == close_
-        assert unhash(tokens[4].lex) == "'"
+        assert tokens[0].string == "`"
+        assert tokens[1].string == open_
+        assert tokens[2].string == word_str
+        assert tokens[2].string == word_str
+        assert tokens[3].string == close_
+        assert tokens[4].string == "'"
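
A rough usage sketch of the tokenizer API the updated tests now target: the module-level
tokenize()/unhash() helpers are gone, and callers go through the shared EN instance and read
each token's .string view instead. The input text and the ")" punctuation mark below are
illustrative only, not taken from the test fixtures:

    from spacy.en import EN

    # "Hello)" should come back as two tokens, with the trailing punctuation split off
    tokens = EN.tokenize("Hello)")
    assert len(tokens) == 2
    assert tokens[0].string == "Hello"
    assert tokens[1].string == ")"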