From 4c6ce7ee84acd4f2f47eaddb453ffc8786a41070 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Sat, 20 Dec 2014 07:03:26 +1100
Subject: [PATCH] * Update tokens.pyx as part of reorg

---
 spacy/tokens.pxd | 47 ++++++++++++++++-------------------------------
 spacy/tokens.pyx | 30 ++++++++----------------------
 2 files changed, 24 insertions(+), 53 deletions(-)

diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd
index 9a0e09f92..12eb70cc1 100644
--- a/spacy/tokens.pxd
+++ b/spacy/tokens.pxd
@@ -4,27 +4,12 @@ import numpy as np
 cimport numpy as np
 
 from cymem.cymem cimport Pool
-from thinc.typedefs cimport atom_t
 
-from .lexeme cimport Lexeme
+from .structs cimport Lexeme, TokenC, Morphology
 
-from .typedefs cimport flags_t
-from .typedefs cimport Morphology
-from .lang cimport Language
+from .typedefs cimport flags_t, attr_t
 
-
-
-cdef struct TokenC:
-    const Lexeme* lex
-    Morphology morph
-    int idx
-    int pos
-    int lemma
-    int sense
-    int head
-    int dep_tag
-    uint32_t l_kids
-    uint32_t r_kids
+from .strings cimport StringStore
 
 
 ctypedef const Lexeme* const_Lexeme_ptr
@@ -37,7 +22,7 @@ ctypedef fused LexemeOrToken:
 
 cdef class Tokens:
     cdef Pool mem
-    cdef Language lang
+    cdef StringStore string_store  # passed on to every Token this container creates
     cdef list tag_names
 
     cdef TokenC* data
@@ -51,7 +36,7 @@ cdef class Tokens:
 
 
 cdef class Token:
-    cdef public Language lang
+    cdef public StringStore string_store  # maps attribute ids (sic, lemma) to UTF-8 bytes
     cdef public int i
     cdef public int idx
     cdef int pos
@@ -59,18 +44,18 @@ cdef class Token:
     cdef public int head
     cdef public int dep_tag
 
-    cdef public atom_t id
-    cdef public atom_t cluster
-    cdef public atom_t length
-    cdef public atom_t postype
-    cdef public atom_t sensetype
+    cdef public attr_t id
+    cdef public attr_t cluster
+    cdef public attr_t length
+    cdef public attr_t postype
+    cdef public attr_t sensetype
 
-    cdef public atom_t sic
-    cdef public atom_t norm
-    cdef public atom_t shape
-    cdef public atom_t asciied
-    cdef public atom_t prefix
-    cdef public atom_t suffix
+    cdef public attr_t sic
+    cdef public attr_t norm
+    cdef public attr_t shape
+    cdef public attr_t asciied
+    cdef public attr_t prefix
+    cdef public attr_t suffix
 
     cdef public float prob
 
diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx
index e0d320b30..f4b1c952d 100644
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@@ -2,7 +2,7 @@
 from preshed.maps cimport PreshMap
 from preshed.counter cimport PreshCounter
 
-from .lexeme cimport *
+from .lexeme cimport get_attr, EMPTY_LEXEME, LEMMA, attr_id_t
 cimport cython
 
 import numpy as np
@@ -30,8 +30,8 @@ cdef class Tokens:
     >>> from spacy.en import EN
     >>> tokens = EN.tokenize('An example sentence.')
     """
-    def __init__(self, Language lang, string_length=0):
-        self.lang = lang
+    def __init__(self, StringStore string_store, string_length=0):
+        self.string_store = string_store
         if string_length >= 3:
             size = int(string_length / 3.0)
         else:
@@ -50,7 +50,7 @@ cdef class Tokens:
 
     def __getitem__(self, i):
         bounds_check(i, self.length, PADDING)
-        return Token(self.lang, i, self.data[i].idx, self.data[i].pos,
+        return Token(self.string_store, i, self.data[i].idx, self.data[i].pos,
                      self.data[i].lemma, self.data[i].head, self.data[i].dep_tag,
                      self.data[i].lex[0])
 
@@ -97,20 +97,6 @@ cdef class Tokens:
             counts.inc(attr, 1)
         return dict(counts)
 
-    def base_nps(self):
-        # Iterate backwards, looking for nouns, and if we're collecting, for an
-        # outside-NP word. We want greedy matching, so it's easier to find the noun.
-        cdef TokenC* token 
-        cdef int end = -1
-        for i in range(self.length-1, -1, -1):
-            token = &self.data[i]
-            if end == -1:
-                if self.lang.is_base_np_end(token):
-                    end = i
-            elif self.lang.is_outside_base_np(token):
-                yield i-1, end
-                end = -1
-
     def _realloc(self, new_size):
         self.max_length = new_size
         n = new_size + (PADDING * 2)
@@ -129,9 +115,9 @@ cdef class Tokens:
 
 @cython.freelist(64)
 cdef class Token:
-    def __init__(self, Language lang, int i, int idx,
+    def __init__(self, StringStore string_store, int i, int idx,
                  int pos, int lemma, int head, int dep_tag, dict lex):
-        self.lang = lang
+        self.string_store = string_store
         self.idx = idx
         self.pos = pos
         self.i = i
@@ -158,14 +144,14 @@ cdef class Token:
         def __get__(self):
             if self.sic == 0:
                 return ''
-            cdef bytes utf8string = self.lang.lexicon.strings[self.sic]
+            cdef bytes utf8string = self.string_store[self.sic]
             return utf8string.decode('utf8')
 
     property lemma:
         def __get__(self):
             if self.lemma == 0:
                 return self.string
-            cdef bytes utf8string = self.lang.lexicon.strings[self.lemma]
+            cdef bytes utf8string = self.string_store[self.lemma]
             return utf8string.decode('utf8')
 
     property pos: