From 71ee92105598a6372ea3eb2edea1e56f1f0a83b6 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 10 Oct 2014 19:17:22 +1100
Subject: [PATCH] * Slight cleaning of tokenizer code

---
 spacy/lang.pxd   |  2 +-
 spacy/lang.pyx   | 38 +++++++++++++++++++-------------------
 spacy/lexeme.pxd |  3 ++-
 spacy/lexeme.pyx |  6 +++---
 spacy/tokens.pxd |  1 +
 spacy/tokens.pyx |  5 ++++-
 spacy/word.pyx   |  2 +-
 7 files changed, 31 insertions(+), 26 deletions(-)

diff --git a/spacy/lang.pxd b/spacy/lang.pxd
index 906b9231f..fc41e7851 100644
--- a/spacy/lang.pxd
+++ b/spacy/lang.pxd
@@ -41,6 +41,7 @@ cdef class Lexicon:
 cdef class Language:
     cdef Pool _mem
     cdef unicode name
+    cdef vector[size_t] counts
     cdef PreshMap cache
     cdef PreshMap specials
     cpdef readonly Lexicon lexicon
@@ -51,7 +52,6 @@ cdef class Language:
     cpdef Tokens tokenize(self, unicode text)
     cpdef Lexeme lookup(self, unicode text)
 
-    cdef int _check_cache(self, vector[LexemeC*] *tokens, String* string) except -1
     cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1
     cdef int _find_prefix(self, Py_UNICODE* characters, size_t length)
     cdef int _find_suffix(self, Py_UNICODE* characters, size_t length)
diff --git a/spacy/lang.pyx b/spacy/lang.pyx
index 73f5d358a..831d79999 100644
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -16,6 +16,7 @@ import re
 from .util import read_lang_data
 from spacy.tokens import Tokens
 from spacy.lexeme cimport LexemeC, get_lexeme_dict, lexeme_pack, lexeme_unpack
+from spacy.lexeme cimport LexStr_orig
 
 from murmurhash.mrmr cimport hash64
 from cpython.ref cimport Py_INCREF
@@ -45,12 +46,20 @@ cdef class Language:
         self.suffix_re = re.compile(suffix)
         self.lexicon = Lexicon(lexemes)
         self._load_special_tokenization(rules)
+        self.counts = vector[size_t]()
 
     property nr_types:
         def __get__(self):
             """Return the number of lexical types in the vocabulary"""
             return self.lexicon.size
 
+    property counts:
+        def __get__(self):
+            cdef size_t i
+            for i in range(self.lexicon.size):
+                count = self.counts[i] if i < self.counts.size() else 0
+                yield count, self.lexicon.lexemes[i].strings[LexStr_orig].decode('utf8')
+
     cpdef Lexeme lookup(self, unicode string):
         """Retrieve (or create, if not found) a Lexeme for a string, and return it.
@@ -85,23 +94,23 @@ cdef class Language:
         cdef size_t start = 0
         cdef size_t i = 0
         cdef Py_UNICODE* chars = string
-        cdef Py_UNICODE c
         cdef String span
         for i in range(length):
-            c = chars[i]
-            if Py_UNICODE_ISSPACE(c) == 1:
+            if Py_UNICODE_ISSPACE(chars[i]) == 1:
                 if start < i:
                     string_from_slice(&span, chars, start, i)
-                    try:
-                        self._tokenize(tokens.v, &span)
-                    except MemoryError:
-                        print chars[start:i]
-                        raise
+                    self._tokenize(tokens.v, &span)
                 start = i + 1
             i += 1
         if start < i:
             string_from_slice(&span, chars, start, i)
             self._tokenize(tokens.v, &span)
+        cdef int id_
+        for i in range(tokens.v.size()):
+            id_ = tokens.id(i)
+            while id_ >= self.counts.size():
+                self.counts.push_back(0)
+            self.counts[id_] += 1
         return tokens
 
     cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1:
@@ -163,17 +172,6 @@ cdef class Language:
         self._attach_tokens(tokens_v, string, &prefixes, &suffixes)
         self._save_cached(tokens_v, orig_key, orig_size)
 
-    cdef int _check_cache(self, vector[LexemeC*] *tokens, String* string) except -1:
-        lexemes = <LexemeC**>self.cache.get(string.key)
-        cdef size_t i = 0
-        if lexemes != NULL:
-            while lexemes[i] != NULL:
-                tokens.push_back(lexemes[i])
-                i += 1
-            string.n = 0
-            string.key = 0
-            string.chars = NULL
-
     cdef int _attach_tokens(self, vector[LexemeC*] *tokens,
                             String* string,
                             vector[LexemeC*] *prefixes,
                             vector[LexemeC*] *suffixes) except -1:
@@ -261,6 +259,7 @@ cdef class Lexicon:
             lexeme = <LexemeC*>self._mem.alloc(1, sizeof(LexemeC))
             lexeme_unpack(lexeme, lexeme_dict)
             self._dict.set(string.key, lexeme)
+            self.lexemes.push_back(lexeme)
             self.size += 1
 
     cdef LexemeC* get(self, String* string) except NULL:
@@ -273,6 +272,7 @@ cdef class Lexicon:
         cdef unicode unicode_string = string.chars[:string.n]
         lexeme_unpack(lex, get_lexeme_dict(self.size, unicode_string))
         self._dict.set(string.key, lex)
+        self.lexemes.push_back(lex)
         self.size += 1
         return lex
 
diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd
index d7c85619d..11b40e0e8 100644
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -21,7 +21,7 @@ cpdef enum LexFloats:
 
 
 cpdef enum LexStrs:
-    LexStr_key
+    LexStr_orig
     LexStr_casefix
     LexStr_shape
     LexStr_unsparse
@@ -70,6 +70,7 @@ cdef struct LexemeC:
     flag_t orth_flags
     flag_t dist_flags
 
+
 cpdef dict get_lexeme_dict(size_t i, unicode string)
 
 cdef char* intern_and_encode(unicode string, size_t* length) except NULL
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index b84ed4a02..8df0e554c 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -19,8 +19,8 @@ cpdef dict get_lexeme_dict(size_t i, unicode string):
     floats[LexFloat_sentiment] = 0
 
     strings = [None for _ in range(LexStr_N)]
-    strings[LexStr_key] = string
-    strings[LexStr_casefix] = strings[LexStr_key]
+    strings[LexStr_orig] = string
+    strings[LexStr_casefix] = strings[LexStr_orig]
     strings[LexStr_shape] = orth.word_shape(string)
     strings[LexStr_unsparse] = strings[LexStr_shape]
     strings[LexStr_asciied] = orth.asciied(string)
@@ -42,9 +42,9 @@ def get_orth_flags(unicode string):
     flags |= orth.is_space(string) << LexOrth_space
     flags |= orth.is_title(string) << LexOrth_title
     flags |= orth.is_upper(string) << LexOrth_upper
-
     return flags
 
+
 def get_dist_flags(unicode string):
     return 0
 
diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd
index b138387bf..8fd58ea8c 100644
--- a/spacy/tokens.pxd
+++ b/spacy/tokens.pxd
@@ -5,6 +5,7 @@ from libcpp.vector cimport vector
 cdef class Tokens:
     cdef vector[LexemeC*] *v
 
+    cpdef int id(self, size_t i) except -1
     cpdef unicode string(self, size_t i)
     cpdef float prob(self, size_t i) except 1
     cpdef int cluster(self, size_t i) except *
diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx
index 18f0c1533..c15ad7de1 100644
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@@ -40,8 +40,11 @@ cdef class Tokens:
     def append(self, Lexeme lexeme):
         self.v.push_back(lexeme._c)
 
+    cpdef int id(self, size_t i) except -1:
+        return self.v.at(i).ints[LexInt_i]
+
     cpdef unicode string(self, size_t i):
-        cdef bytes utf8_string = self.v.at(i).strings[LexStr_key]
+        cdef bytes utf8_string = self.v.at(i).strings[LexStr_orig]
         cdef unicode string = utf8_string.decode('utf8')
         return string
 
diff --git a/spacy/word.pyx b/spacy/word.pyx
index 617e8809f..ab4ee6b68 100644
--- a/spacy/word.pyx
+++ b/spacy/word.pyx
@@ -54,7 +54,7 @@ cdef class Lexeme:
 
     property string:
         def __get__(self):
-            cdef bytes utf8_string = self._c.strings[LexStr_key]
+            cdef bytes utf8_string = self._c.strings[LexStr_orig]
             cdef unicode string = utf8_string.decode('utf8')
             return string
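
Note on the new counting scheme, for readers skimming the diff: Language.tokenize now tallies token frequencies in a flat vector[size_t] indexed by the dense lexeme id that Tokens.id() exposes, padding the vector with zeros whenever an id falls past its current end, and the new counts property pairs each lexeme's frequency with its original string. Below is a minimal pure-Python sketch of the same scheme; the CountingTokenizer class and its helpers are illustrative stand-ins, not part of spaCy's API.

    class CountingTokenizer(object):
        """Toy analogue of Language: dense ids plus a grow-on-demand count array."""

        def __init__(self):
            self.ids = {}      # string -> dense integer id (stands in for the Lexicon)
            self.counts = []   # counts[i] = frequency of lexeme id i

        def _id(self, token):
            # Assign dense ids in first-seen order, much as Lexicon.get hands out
            # self.size as the id of each newly interned string.
            return self.ids.setdefault(token, len(self.ids))

        def tokenize(self, text):
            tokens = text.split()
            for tok in tokens:
                id_ = self._id(tok)
                # Mirror the patch: pad with zeros until id_ is in range, then bump.
                # Ids are dense, so the array stays compact and growth is amortised.
                while id_ >= len(self.counts):
                    self.counts.append(0)
                self.counts[id_] += 1
            return tokens

        def frequencies(self):
            # Analogue of the new `counts` property: yield (count, string) pairs.
            # The bounds guard mirrors the real property, where the Lexicon can
            # hold lexemes the counts vector has never reached.
            for string, i in sorted(self.ids.items(), key=lambda kv: kv[1]):
                count = self.counts[i] if i < len(self.counts) else 0
                yield count, string

    tokenizer = CountingTokenizer()
    tokenizer.tokenize(u"the cat sat on the mat")
    assert dict((s, c) for c, s in tokenizer.frequencies())[u"the"] == 2

A dense vector suits this better than a hash table because lexeme ids are small consecutive integers: each lookup is a single array access, and the zero-padding loop only runs the first time a new id appears.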