From 7c09c73a14e55f9eedda0d7664deeabe53474b0e Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 10 Sep 2014 18:27:44 +0200
Subject: [PATCH] * Refactor to use tokens class.

---
 spacy/en.pyx          | 5 ++++-
 spacy/lang.pyx        | 6 +++---
 spacy/orth.py         | 4 ++--
 tests/test_asciify.py | 8 ++++----
 tests/test_orth.py    | 6 ++----
 5 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/spacy/en.pyx b/spacy/en.pyx
index 497c9e350..62e195ca8 100644
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@@ -218,6 +218,9 @@ cdef class English(Language):
         name (unicode): The two letter code used by Wikipedia for the language.
         lexicon (Lexicon): The lexicon. Exposes the lookup method.
     """
+    fl_is_alpha = Flag_IsAlpha
+    fl_is_digit = Flag_IsDigit
+    v_shape = View_WordShape
     def __cinit__(self, name, user_string_features, user_flag_features):
         self.cache = {}
         lang_data = util.read_lang_data(name)
@@ -226,7 +229,7 @@ cdef class English(Language):
                                STRING_VIEW_FUNCS + user_string_features,
                                FLAG_FUNCS + user_flag_features)
         self._load_special_tokenization(rules)
-        self.token_class = EnglishTokens
+        self.tokens_class = EnglishTokens
 
     cdef int _split_one(self, unicode word):
         cdef size_t length = len(word)
diff --git a/spacy/lang.pyx b/spacy/lang.pyx
index 3d02b7677..b3d6dcd0e 100644
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -43,7 +43,7 @@ cdef class Language:
         self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats,
                                string_features, flag_features)
         self._load_special_tokenization(rules)
-        self.token_class = Tokens
+        self.tokens_class = Tokens
 
     property nr_types:
         def __get__(self):
@@ -81,7 +81,7 @@ cdef class Language:
         cdef size_t length = len(string)
         cdef size_t start = 0
         cdef size_t i = 0
-        cdef Tokens tokens = self.token_class()
+        cdef Tokens tokens = self.tokens_class()
         for c in string:
             if c == ' ':
                 if start < i:
@@ -91,7 +91,7 @@ cdef class Language:
             if start < i:
                 self._tokenize(tokens, string[start:i])
         assert tokens
-        return tokens
+        return tokens.lexemes
 
     cdef _tokenize(self, Tokens tokens, unicode string):
         cdef list lexemes
diff --git a/spacy/orth.py b/spacy/orth.py
index b7106d609..685de191c 100644
--- a/spacy/orth.py
+++ b/spacy/orth.py
@@ -56,7 +56,7 @@ def oft_case(name, thresh):
     return wrapped
 
 
-def can_tag(name, thresh):
+def can_tag(name, thresh=0.5):
     def wrapped(string, prob, case_stats, tag_stats):
         return string
     return wrapped
@@ -111,7 +111,7 @@ def non_sparse(string, prob, cluster, case_stats, tag_stats):
         return word_shape(string, prob, cluster, case_stats, tag_stats)
 
 
-def asciied(string):
+def asciied(string, prob=0, cluster=0, case_stats=None, tag_stats=None):
     '''"ASCIIfy" a Unicode string by stripping all umlauts, tildes, etc.'''
     # Snippet from
     # http://www.physic.ut.ee/~kkannike/english/prog/python/util/asciify/index.html
diff --git a/tests/test_asciify.py b/tests/test_asciify.py
index eed71a5f3..d03af0d25 100644
--- a/tests/test_asciify.py
+++ b/tests/test_asciify.py
@@ -3,16 +3,16 @@ from __future__ import unicode_literals
 
 import pytest
 
-from spacy.orth import asciify
+from spacy.orth import asciied
 
 
 def test_tilde():
     string = u'hõmbre'
-    assert asciify(string) == u'hombre'
+    assert asciied(string) == u'hombre'
 
 
 def test_smart_quote():
     string = u'“'
-    assert asciify(string) == '"'
+    assert asciied(string) == '"'
     string = u'”'
-    assert asciify(string) == '"'
+    assert asciied(string) == '"'
diff --git a/tests/test_orth.py b/tests/test_orth.py
index 33cd4014a..0840af683 100644
--- a/tests/test_orth.py
+++ b/tests/test_orth.py
@@ -3,9 +3,7 @@ from __future__ import unicode_literals
 
 import pytest
 import spacy.word
-from spacy import en
-
-EN = en.EN
+from spacy.en import EN
 
 
 @pytest.fixture
@@ -14,7 +12,7 @@ def C3P0():
 
 
 def test_shape(C3P0):
-    assert C3P0.string_view(en.SHAPE) == "XdXd"
+    assert C3P0.string_view(EN.v_shape) == "XdXd"
 
 
 def test_length():
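--
A minimal usage sketch (illustrative only, not part of the patch) of how the
renamed API reads after this refactor. EN is the module-level English instance
the tests import from spacy.en; tokenize(), string_view(), and the v_shape
class attribute all appear in the hunks above, while lookup() is an assumption,
since the C3P0 fixture body is not shown in the diff.

    # Sketch of the refactored tokens_class API (Python 2, as in the tests).
    from __future__ import unicode_literals
    from spacy.en import EN

    # Language.tokenize() now returns tokens.lexemes instead of the Tokens object.
    lexemes = EN.tokenize('See C3P0 run.')

    # View constants now hang off the English class, so callers write
    # EN.v_shape rather than importing a module-level en.SHAPE constant.
    word = EN.lookup('C3P0')                       # assumed lookup API
    assert word.string_view(EN.v_shape) == 'XdXd'  # letter/digit word shape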