mirror of https://github.com/explosion/spaCy.git
synced 2024-11-10 19:57:17 +03:00
* More refactoring
This commit is contained in:
parent 88095666dc
commit 68bae2fec6
setup.py | 10
@@ -45,13 +45,13 @@ else:
    exts = [
        #Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes),
        Extension("spacy.en", ["spacy/en.pyx"], language="c++",
                  include_dirs=includes),
        Extension("spacy.ptb3", ["spacy/ptb3.pyx"], language="c++", include_dirs=includes),
        Extension("spacy.spacy", ["spacy/spacy.pyx"], language="c++", include_dirs=includes),
        Extension("spacy.lang", ["spacy/spacy.pyx"], language="c++", include_dirs=includes),
        Extension("spacy.word", ["spacy/word.pyx"], language="c++",
                  include_dirs=includes),
        Extension("spacy.ptb3", ["spacy/ptb3.pyx"], language="c++", include_dirs=includes),
        Extension("spacy.en", ["spacy/en.pyx"], language="c++",
                  include_dirs=includes),
    ]
@@ -1,25 +0,0 @@
from libc.stdint cimport uint64_t

from chartree cimport CharTree


cdef class FixedTable:
    cdef size_t size
    cdef uint64_t* keys
    cdef size_t* values

    cdef size_t insert(self, uint64_t key, size_t value) nogil
    cdef size_t get(self, uint64_t key) nogil
    cdef int erase(self, uint64_t key) nogil


cdef class WordTree:
    cdef size_t max_length
    cdef size_t default
    cdef CharTree* _trees
    cdef dict _dict

    cdef size_t get(self, unicode string) except *
    cdef int set(self, unicode string, size_t value) except *
    cdef bint contains(self, unicode string) except *
@@ -1,98 +0,0 @@
from libc.stdlib cimport calloc, free
import cython

cimport chartree


cdef class FixedTable:
    def __cinit__(self, const size_t size):
        self.size = size
        self.keys = <uint64_t*>calloc(self.size, sizeof(uint64_t))
        self.values = <size_t*>calloc(self.size, sizeof(size_t))

    def __dealloc__(self):
        free(self.keys)
        free(self.values)

    def __getitem__(self, uint64_t key):
        return self.get(key)

    def __setitem__(self, uint64_t key, size_t value):
        self.insert(key, value)

    def pop(self, uint64_t key):
        self.delete(key)

    def bucket(self, uint64_t key):
        return _find(key, self.size)

    cdef size_t insert(self, uint64_t key, size_t value) nogil:
        cdef size_t bucket = _find(key, self.size)
        cdef size_t clobbered
        if self.values[bucket] == value:
            clobbered = 0
        else:
            clobbered = self.values[bucket]
        self.keys[bucket] = key
        self.values[bucket] = value
        return clobbered

    cdef size_t get(self, uint64_t key) nogil:
        cdef size_t bucket = _find(key, self.size)
        if self.keys[bucket] == key:
            return self.values[bucket]
        else:
            return 0

    cdef int erase(self, uint64_t key) nogil:
        cdef size_t bucket = _find(key, self.size)
        self.keys[bucket] = 0
        self.values[bucket] = 0


@cython.cdivision
cdef inline size_t _find(uint64_t key, size_t size) nogil:
    return key % size


cdef class WordTree:
    def __cinit__(self, size_t default, size_t max_length):
        self.max_length = max_length
        self.default = default
        self._trees = <CharTree*>calloc(max_length, sizeof(CharTree))
        for i in range(self.max_length):
            chartree.init(&self._trees[i], i)
        self._dict = {}

    cdef size_t get(self, unicode ustring) except *:
        cdef bytes bstring = ustring.encode('utf8')
        cdef size_t length = len(bstring)
        if length >= self.max_length:
            return self._dict.get(bstring, 0)
        else:
            return chartree.getitem(&self._trees[length], bstring)

    cdef int set(self, unicode ustring, size_t value) except *:
        cdef bytes bstring = ustring.encode('utf8')
        cdef size_t length = len(bstring)
        if length >= self.max_length:
            self._dict[bstring] = value
        else:
            chartree.setitem(&self._trees[length], bstring, value)

    cdef bint contains(self, unicode ustring) except *:
        cdef bytes bstring = ustring.encode('utf8')
        cdef size_t length = len(bstring)
        if length >= self.max_length:
            return bstring in self._dict
        else:
            return chartree.contains(&self._trees[length], bstring)

    def __getitem__(self, unicode key):
        return self.get(key)

    def __setitem__(self, unicode key, size_t value):
        self.set(key, value)

    def __contains__(self, unicode key):
        return self.contains(key)
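The FixedTable removed above is a fixed-size map with exactly one bucket per key: a colliding insert simply overwrites the occupant and hands back the clobbered value. A minimal pure-Python sketch of that contract (hypothetical names, for illustration only):

    class FixedTableSketch:
        """Fixed-size map where collisions clobber, as in the removed FixedTable."""
        def __init__(self, size):
            self.size = size
            self.keys = [0] * size
            self.values = [0] * size

        def insert(self, key, value):
            bucket = key % self.size   # same bucket rule as the cdef _find()
            clobbered = 0 if self.values[bucket] == value else self.values[bucket]
            self.keys[bucket] = key
            self.values[bucket] = value
            return clobbered           # previous occupant; 0 if empty or unchanged

        def get(self, key):
            bucket = key % self.size
            return self.values[bucket] if self.keys[bucket] == key else 0

    t = FixedTableSketch(8)
    t.insert(3, 100)
    print(t.insert(11, 200))  # 100: key 11 hashes to bucket 3 and clobbers it
    print(t.get(3))           # 0: key 3 no longer occupies its bucket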
spacy/en.pxd | 36
@@ -1,15 +1,38 @@
from libcpp.vector cimport vector

from spacy.spacy cimport StringHash

from spacy.spacy cimport Language
from spacy.word cimport LatinWord
from spacy.word cimport Lexeme
cimport cython


cpdef size_t ALPHA
cpdef size_t DIGIT
cpdef size_t PUNCT
cpdef size_t SPACE
cpdef size_t LOWER
cpdef size_t UPPER
cpdef size_t TITLE
cpdef size_t ASCII

cpdef size_t OFT_LOWER
cpdef size_t OFT_TITLE
cpdef size_t OFT_UPPER

cpdef size_t PUNCT
cpdef size_t CONJ
cpdef size_t NUM
cpdef size_t N
cpdef size_t DET
cpdef size_t ADP
cpdef size_t ADJ
cpdef size_t ADV
cpdef size_t VERB
cpdef size_t NOUN
cpdef size_t PDT
cpdef size_t POS
cpdef size_t PRON
cpdef size_t PRT

cdef class English(spacy.Language):
    cdef int find_split(self, unicode word)
    cdef LatinWord new_lexeme(self, unicode string)


cdef English EN

@@ -17,4 +40,3 @@ cdef English EN

cpdef Word lookup(unicode word)
cpdef list tokenize(unicode string)
cpdef unicode unhash(StringHash hash_value)
spacy/en.pyx | 102
@@ -43,9 +43,85 @@ from libc.stdint cimport uint64_t
cimport spacy


# Python-readable flag constants --- can't read an enum from Python

# Don't want to manually assign these numbers, or we'll insert one and have to
# change them all.
# Don't use "i", as we don't want it in the global scope!
cdef size_t __i = 0

ALPHA = __i; __i += 1
DIGIT = __i; __i += 1
PUNCT = __i; __i += 1
SPACE = __i; __i += 1
LOWER = __i; __i += 1
UPPER = __i; __i += 1
TITLE = __i; __i += 1
ASCII = __i; __i += 1

OFT_LOWER = __i; __i += 1
OFT_UPPER = __i; __i += 1
OFT_TITLE = __i; __i += 1

PUNCT = __i; __i += 1
CONJ = __i; __i += 1
NUM = __i; __i += 1
X = __i; __i += 1
DET = __i; __i += 1
ADP = __i; __i += 1
ADJ = __i; __i += 1
ADV = __i; __i += 1
VERB = __i; __i += 1
NOUN = __i; __i += 1
PDT = __i; __i += 1
POS = __i; __i += 1
PRON = __i; __i += 1
PRT = __i; __i += 1


# These are for the string views
__i = 0
SIC = __i; __i += 1
CANON_CASED = __i; __i += 1
NON_SPARSE = __i; __i += 1
SHAPE = __i; __i += 1
NR_STRING_VIEWS = __i


def get_string_views(unicode string, lexeme):
    views = ['' for _ in range(NR_STRING_VIEWS)]
    views[SIC] = string
    views[CANON_CASED] = canonicalize_case(string, lexeme)
    views[SHAPE] = get_string_shape(string)
    views[NON_SPARSE] = get_non_sparse(string, views[CANON_CASED], views[SHAPE],
                                       lexeme)
    return views


def set_orth_flags(unicode string, flags_t flags):
    setters = [
        (ALPHA, is_alpha),
        (DIGIT, is_digit),
        (PUNCT, is_punct),
        (SPACE, is_space),
        (LOWER, is_lower),
        (UPPER, is_upper),
        (SPACE, is_space)
    ]

    for bit, setter in setters:
        if setter(string):
            flags |= 1 << bit
    return flags


cdef class English(spacy.Language):
    cdef LatinWord new_lexeme(self, unicode string):
        return LatinWord(string)
    cdef Lexeme new_lexeme(self, unicode string, cluster=0, prob=0, case_stats=None,
                           tag_freqs=None):
        return Lexeme(s, length, views, prob=prob, cluster=cluster,
                      flags=self.get_flags(string))

    cdef int find_split(self, unicode word):
        cdef size_t length = len(word)
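Each constant above is a bit position, so a lexeme's boolean features pack into a single integer and a feature test is a mask. A plain-Python sketch of the scheme (names reused for illustration; not the module's API):

    ALPHA, DIGIT = 0, 1                 # bit positions, assigned as above

    def set_orth_flags_sketch(string, flags=0):
        # Set each bit whose predicate holds for the string.
        for bit, setter in [(ALPHA, str.isalpha), (DIGIT, str.isdigit)]:
            if setter(string):
                flags |= 1 << bit
        return flags

    flags = set_orth_flags_sketch("Hello")
    print(bool(flags & (1 << ALPHA)))   # True: the ALPHA bit is set
    print(bool(flags & (1 << DIGIT)))   # False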
@@ -101,7 +177,7 @@ cpdef list tokenize(unicode string):
    return EN.tokenize(string)


cpdef Word lookup(unicode string):
cpdef Lexeme lookup(unicode string):
    """Retrieve (or create, if not found) a Lexeme for a string, and return its ID.

    Properties of the Lexeme are accessed by passing LexID to the accessor methods.

@@ -116,23 +192,6 @@ cpdef Word lookup(unicode string):
    return EN.lookup(string)


cpdef unicode unhash(StringHash hash_value):
    """Retrieve a string from a hash value. Mostly used for testing.

    In general you should avoid computing with strings, as they are slower than
    the intended ID-based usage. However, strings can be recovered if necessary,
    although no check is made for hash collisions.

    Args:
        hash_value (StringHash): The hash of a string, returned by Python's hash()
            function.

    Returns:
        string (unicode): A unicode string that hashes to the hash_value.
    """
    return EN.unhash(hash_value)


def add_string_views(view_funcs):
    """Add a string view to existing and previous lexical entries.


@@ -150,16 +209,19 @@ def load_clusters(location):
    """
    pass


def load_unigram_probs(location):
    """Load unigram probabilities.
    """
    pass


def load_case_stats(location):
    """Load case stats.
    """
    pass


def load_tag_stats(location):
    """Load tag statistics.
    """
@@ -1,16 +1,12 @@
from libc.stdint cimport uint32_t
from libc.stdint cimport uint64_t
from spacy.word cimport Word

ctypedef uint32_t StringHash

from spacy.word cimport Lexeme


cdef class Language:
    cdef object name
    cdef dict chunks
    cdef dict vocab
    cdef dict bacov
    cdef dict blobs
    cdef dict lexicon

    cpdef list tokenize(self, unicode text)

@@ -20,8 +16,5 @@ cdef class Language:
    cdef list new_chunk(self, unicode string, list substrings)
    cdef Word new_lexeme(self, unicode lex)

    cpdef unicode unhash(self, StringHash hashed)

    cpdef list find_substrings(self, unicode chunk)
    cdef int find_split(self, unicode word)
    cdef int set_orth(self, unicode string, Word word)
@@ -15,16 +15,13 @@ from libc.stdlib cimport calloc, free
from . import util
from os import path

TAGS = {}
DIST_FLAGS = {}

cdef class Language:
    view_funcs = []
    def __cinit__(self, name):
        self.name = name
        self.bacov = {}
        self.chunks = {}
        self.vocab = {}
        self.blobs = {}
        self.lexicon = {}
        self.load_tokenization(util.read_tokenization(name))
        self.load_dist_info(util.read_dist_info(name))


@@ -37,26 +34,26 @@ cdef class Language:
        string (unicode): The string to split.

        Returns:
            tokens (Tokens): A Tokens object.
            tokens (list): A list of Lexeme objects.
        """
        cdef list chunk
        cdef list blob
        cdef list tokens = []
        cdef size_t length = len(string)
        cdef size_t start = 0
        cdef size_t i = 0
        for c in string:
            if _is_whitespace(c):
            if c == ' ':
                if start < i:
                    chunk = self.lookup_chunk(string[start:i])
                    tokens.extend(chunk)
                    blob = self.lookup_blob(string[start:i])
                    tokens.extend(blob)
                start = i + 1
            i += 1
        if start < i:
            chunk = self.lookup_chunk(string[start:])
            chunk = self.lookup_blob(string[start:])
            tokens.extend(chunk)
        return tokens

    cdef Word lookup(self, unicode string):
    cdef Lexeme lookup(self, unicode string):
        assert len(string) != 0
        cdef Word word
        if string in self.vocab:
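The tokenize loop rewritten above is a single pass over the string: each space-delimited "blob" is looked up once, and the cached lookup expands it to its token list. The control flow in plain Python (a hypothetical lookup_blob stands in for the cached method):

    def tokenize_sketch(string, lookup_blob):
        # Split on spaces; each blob expands via a cached lookup.
        tokens = []
        start = 0
        for i, c in enumerate(string):
            if c == ' ':
                if start < i:
                    tokens.extend(lookup_blob(string[start:i]))
                start = i + 1
        if start < len(string):
            tokens.extend(lookup_blob(string[start:]))
        return tokens

    print(tokenize_sketch(u"Mike's dog!", lambda blob: [blob]))
    # [u"Mike's", u'dog!'] -- a real lookup_blob would expand via find_substrings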
@@ -65,28 +62,26 @@ cdef class Language:
            word = self.new_lexeme(string)
        return word

    cdef list lookup_chunk(self, unicode string):
    cdef list lookup_blob(self, unicode string):
        cdef list chunk
        cdef size_t chunk_id
        if string in self.chunks:
            chunk = self.chunks[string]
        cdef size_t blob_id
        if string in self.blobs:
            blob = self.blobs[string]
        else:
            chunk = self.new_chunk(string, self.find_substrings(string))
            blob = self.new_blob(string, self.find_substrings(string))
        return chunk

    cdef list new_chunk(self, unicode string, list substrings):
        chunk = []
    cdef list new_blob(self, unicode string, list substrings):
        blob = []
        for i, substring in enumerate(substrings):
            chunk.append(self.lookup(substring))
        self.chunks[string] = chunk
        return chunk
            blob.append(self.lookup(substring))
        self.blobs[string] = chunk
        return blob

    cdef Word new_lexeme(self, unicode string):
        string_views = [view_func(string) for view_func in self.view_funcs]
        word = Word(string.encode('utf8'), string_views)
        self.bacov[word.lex] = string
        self.vocab[string] = word
        return word
        # TODO
        #lexeme = Lexeme(string.encode('utf8'), string_views)
        #return lexeme

    """
    def add_view_funcs(self, list view_funcs):
@@ -112,11 +107,7 @@ cdef class Language:
            self.bacov[hashed] = view
    """

    cpdef unicode unhash(self, StringHash hash_value):
        '''Fetch a string from the reverse index, given its hash value.'''
        return self.bacov[hash_value]

    cpdef list find_substrings(self, unicode chunk):
    cpdef list find_substrings(self, unicode blob):
        """Find how to split a chunk into substrings.

        This method calls find_split repeatedly. Most languages will want to

@@ -129,21 +120,18 @@ cdef class Language:
            substrings (list): The component substrings, e.g. [u"Mike", "'s", "!"].
        """
        substrings = []
        while chunk:
            split = self.find_split(chunk)
        while blob:
            split = self.find_split(blob)
            if split == 0:
                substrings.append(chunk)
                substrings.append(blob)
                break
            substrings.append(chunk[:split])
            chunk = chunk[split:]
            substrings.append(blob[:split])
            blob = blob[split:]
        return substrings

    cdef int find_split(self, unicode word):
        return len(word)

    cdef int set_orth(self, unicode string, Word word):
        pass

    def load_tokenization(self, token_rules):
        '''Load special-case tokenization rules.
@@ -178,22 +166,3 @@ cdef class Language:
            w.dist_flags |= DIST_FLAGS[flag]
        for tag in word_dist.tagdict:
            w.possible_tags |= TAGS[tag]


cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
    if c == ' ':
        return True
    elif c == '\n':
        return True
    elif c == '\t':
        return True
    else:
        return False


#cdef inline int _extend(Tokens tokens, Lexeme** chunk) nogil:
#    cdef size_t i = 0
#    while chunk[i] != NULL:
#        tokens.vctr[0].push_back(<Lexeme_addr>chunk[i])
#        tokens.length += 1
#        i += 1
@@ -1,32 +0,0 @@
cdef enum OrthFlag:
    IS_ALPHA
    IS_DIGIT
    IS_PUNCT
    IS_SPACE
    IS_LOWER
    IS_UPPER
    IS_TITLE
    IS_ASCII


cdef enum:
    NORM
    SHAPE
    LAST3

from spacy.lexeme cimport LexID
from spacy.lexeme cimport StringHash

cpdef bint is_alpha(LexID lex_id) except *
cpdef bint is_digit(LexID lex_id) except *
cpdef bint is_punct(LexID lex_id) except *
cpdef bint is_space(LexID lex_id) except *
cpdef bint is_lower(LexID lex_id) except *
cpdef bint is_upper(LexID lex_id) except *
cpdef bint is_title(LexID lex_id) except *
cpdef bint is_ascii(LexID lex_id) except *


cpdef StringHash norm_of(LexID lex_id) except 0
cpdef StringHash shape_of(LexID lex_id) except 0
cpdef StringHash last3_of(LexID lex_id) except 0
@@ -1,211 +0,0 @@
# cython: embedsignature=True
from __future__ import unicode_literals

from spacy.lexeme cimport Lexeme


def get_normalized(unicode word):
    """Todo.

    Args:
        word (unicode)

    Returns:
        normalized (unicode)
    """
    if word.isalpha() and word.islower():
        return word
    else:
        return get_word_shape(word)


def get_word_shape(unicode word):
    """Todo.

    Args:
        word (unicode)

    Returns:
        shape (unicode)
    """
    cdef size_t length = len(word)
    shape = ""
    last = ""
    shape_char = ""
    seq = 0
    for c in word:
        if c.isalpha():
            if c.isupper():
                shape_char = "X"
            else:
                shape_char = "x"
        elif c.isdigit():
            shape_char = "d"
        else:
            shape_char = c
        if shape_char == last:
            seq += 1
        else:
            seq = 0
            last = shape_char
        if seq < 3:
            shape += shape_char
    assert shape
    return shape


cpdef unicode get_last3(unicode string):
    return string[-3:]


cpdef bint is_alpha(LexID lex_id) except *:
    """Check whether all characters in the word's string are alphabetic.

    Should match the :py:func:`unicode.isalpha()` function.

    >>> is_alpha(lookup(u'Hello'))
    True
    >>> is_alpha(lookup(u'العرب'))
    True
    >>> is_alpha(lookup(u'10'))
    False
    """
    return (<Lexeme*>lex_id).orth_flags & 1 << IS_ALPHA


cpdef bint is_digit(LexID lex_id) except *:
    """Check whether all characters in the word's string are numeric.

    Should match the :py:func:`unicode.isdigit()` function.

    >>> is_digit(lookup(u'10'))
    True
    >>> is_digit(lookup(u'๐'))
    True
    >>> is_digit(lookup(u'one'))
    False
    """
    return (<Lexeme*>lex_id).orth_flags & 1 << IS_DIGIT


cpdef bint is_punct(LexID lex_id) except *:
    """Check whether all characters belong to a punctuation unicode data category
    for a Lexeme ID.

    >>> is_punct(lookup(u'.'))
    True
    >>> is_punct(lookup(u'⁒'))
    True
    >>> is_punct(lookup(u' '))
    False
    """
    return (<Lexeme*>lex_id).orth_flags & 1 << IS_PUNCT


cpdef bint is_space(LexID lex_id) except *:
    """Give the result of unicode.isspace() for a Lexeme ID.

    >>> is_space(lookup(u'\\t'))
    True
    >>> is_space(lookup(u'<unicode space>'))
    True
    >>> is_space(lookup(u'Hi\\n'))
    False
    """
    return (<Lexeme*>lex_id).orth_flags & 1 << IS_SPACE


cpdef bint is_lower(LexID lex_id) except *:
    """Give the result of unicode.islower() for a Lexeme ID.

    >>> is_lower(lookup(u'hi'))
    True
    >>> is_lower(lookup(<unicode>))
    True
    >>> is_lower(lookup(u'10'))
    False
    """
    return (<Lexeme*>lex_id).orth_flags & 1 << IS_LOWER


cpdef bint is_upper(LexID lex_id) except *:
    """Give the result of unicode.isupper() for a Lexeme ID.

    >>> is_upper(lookup(u'HI'))
    True
    >>> is_upper(lookup(u'H10'))
    True
    >>> is_upper(lookup(u'10'))
    False
    """
    return (<Lexeme*>lex_id).orth_flags & 1 << IS_UPPER


cpdef bint is_title(LexID lex_id) except *:
    """Give the result of unicode.istitle() for a Lexeme ID.

    >>> is_title(lookup(u'Hi'))
    True
    >>> is_title(lookup(u'Hi1'))
    True
    >>> is_title(lookup(u'1'))
    False
    """
    return (<Lexeme*>lex_id).orth_flags & 1 << IS_TITLE


cpdef bint is_ascii(LexID lex_id) except *:
    """Give the result of checking whether all characters in the string are ascii.

    >>> is_ascii(lookup(u'Hi'))
    True
    >>> is_ascii(lookup(u' '))
    True
    >>> is_ascii(lookup(u'<unicode>'))
    False
    """
    return (<Lexeme*>lex_id).orth_flags & 1 << IS_ASCII


cpdef StringHash norm_of(LexID lex_id) except 0:
    """Return the hash of a "normalized" version of the string.

    Normalized strings are intended to be less sparse, while still capturing
    important lexical information. See :py:func:`spacy.latin.orthography.normalize_string`
    for details of the normalization function.

    >>> unhash(norm_of(lookup(u'Hi')))
    u'hi'
    >>> unhash(norm_of(lookup(u'255667')))
    u'shape=dddd'
    >>> unhash(norm_of(lookup(u'...')))
    u'...'
    """
    return (<Lexeme*>lex_id).string_views[NORM]


cpdef StringHash shape_of(LexID lex_id) except 0:
    """Return the hash of a string describing the word's "orthographic shape".

    Orthographic shapes are calculated by the :py:func:`spacy.orthography.latin.string_shape`
    function. Word shape features have been found useful for NER and POS tagging,
    e.g. Manning (2011).

    >>> unhash(shape_of(lookup(u'Hi')))
    u'Xx'
    >>> unhash(shape_of(lookup(u'255667')))
    u'dddd'
    >>> unhash(shape_of(lookup(u'...')))
    u'...'
    """
    cdef Lexeme* w = <Lexeme*>lex_id
    return w.string_views[SHAPE]


cpdef StringHash last3_of(LexID lex_id) except 0:
    '''Return the hash of string[-3:], i.e. the last three characters of the word.

    >>> lex_ids = [lookup(w) for w in (u'Hello', u'!')]
    >>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
    [u'llo', u'!']
    '''
    return (<Lexeme*>lex_id).string_views[LAST3]
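The removed get_word_shape maps letters to x/X and digits to d, passes other characters through, and caps any run of the same shape character at three. Its logic is plain Python-level code, so the same function runs unchanged outside Cython:

    def word_shape(word):
        # Letters become x/X, digits become d, other characters pass through;
        # runs of the same shape character are capped at three.
        shape, last, seq = "", "", 0
        for c in word:
            if c.isalpha():
                shape_char = "X" if c.isupper() else "x"
            elif c.isdigit():
                shape_char = "d"
            else:
                shape_char = c
            if shape_char == last:
                seq += 1
            else:
                seq = 0
                last = shape_char
            if seq < 3:
                shape += shape_char
        return shape

    print(word_shape(u"Pineapple1999!"))   # 'Xxxxddd!'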
@@ -1,7 +0,0 @@
cpdef bytes to_bytes(unicode string)

cpdef unicode from_bytes(bytes string)

cpdef unicode substr(unicode string, int start, int end, size_t length)

cdef bint is_whitespace(Py_UNICODE c)
@@ -1,35 +0,0 @@
# cython: profile=True

cpdef bytes to_bytes(unicode string):
    return string.encode('utf8')


cpdef unicode from_bytes(bytes string):
    return string.decode('utf8')


cpdef unicode substr(unicode string, int start, int end, size_t length):
    if end >= length:
        end = -1
    if start >= length:
        start = 0
    if start <= 0 and end < 0:
        return string
    elif start < 0:
        start = 0
    elif end < 0:
        end = length
    return string[start:end]


cdef bint is_whitespace(Py_UNICODE c):
    # TODO: Support other unicode spaces
    # https://www.cs.tut.fi/~jkorpela/chars/spaces.html
    if c == u' ':
        return True
    elif c == u'\n':
        return True
    elif c == u'\t':
        return True
    else:
        return False
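substr clamps out-of-range indices instead of raising: an end at or past length means "to the end of the string", and an out-of-range start falls back to 0. The same behaviour in plain Python (a direct sketch of the removed function):

    def substr_sketch(string, start, end):
        # Mirror the clamping in the removed substr().
        length = len(string)
        if end >= length:
            end = -1
        if start >= length:
            start = 0
        if start <= 0 and end < 0:
            return string
        elif start < 0:
            start = 0
        elif end < 0:
            end = length
        return string[start:end]

    print(substr_sketch(u"hello", 1, 99))   # 'ello' (end clamped to end-of-string)
    print(substr_sketch(u"hello", 0, 99))   # 'hello'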
@@ -1,18 +0,0 @@
from libcpp.vector cimport vector
from spacy.lexeme cimport LexID
from spacy.lexeme cimport Lexeme

from cython.operator cimport dereference as deref
from spacy.spacy cimport Language


cdef class Tokens:
    cdef Language lang
    cdef vector[LexID]* vctr
    cdef size_t length

    cpdef int append(self, LexID token)
    cpdef int extend(self, Tokens other) except -1

    cpdef object group_by(self, size_t attr)
    cpdef dict count_by(self, size_t attr)
@@ -1,92 +0,0 @@
from cython.operator cimport dereference as deref
from cython.operator cimport preincrement as inc


from spacy.lexeme cimport Lexeme
from spacy.spacy cimport StringHash


cdef class Tokens:
    def __cinit__(self, Language lang):
        self.lang = lang
        self.vctr = new vector[LexID]()
        self.length = 0

    def __dealloc__(self):
        del self.vctr

    def __iter__(self):
        cdef vector[LexID].iterator it = self.vctr[0].begin()
        while it != self.vctr[0].end():
            yield deref(it)
            inc(it)

    def __getitem__(self, size_t idx):
        return self.vctr[0].at(idx)

    def __len__(self):
        return self.length

    cpdef int append(self, LexID token):
        self.vctr[0].push_back(token)
        self.length += 1

    cpdef int extend(self, Tokens other) except -1:
        cdef LexID el
        for el in other:
            self.append(el)

    cpdef object group_by(self, size_t view_idx):
        '''Group tokens that share the property attr into Tokens instances, and
        return a list of them. Returns a tuple of three lists:

        (string names, hashes, tokens)

        The lists are aligned, so the ith entry in string names is the string
        that the ith entry in hashes unhashes to, which the Tokens instance
        is grouped by.

        You can then use count_by or group_by on the Tokens
        for further processing. Calling group_by and then asking the length
        of the Tokens objects is equivalent to count_by, but somewhat slower.
        '''
        # Implementation here is working around some of the constraints in
        # Cython about what type of thing can go in what type of container.
        # Long story short, it's pretty hard to get a Python object like
        # Tokens into a vector or array. If we really need this to run faster,
        # we can be tricky and get the Python list access out of the loop. What
        # we'd do is store pointers to the underlying vectors.
        # So far, speed isn't mattering here.
        cdef dict indices = {}
        cdef list groups = []
        cdef list names = []
        cdef list hashes = []

        cdef StringHash key
        cdef LexID t
        for t in self.vctr[0]:
            if view_idx == 0:
                key = (<Lexeme*>t).lex
            else:
                key = (<Lexeme*>t).string_views[view_idx - 1]
            if key in indices:
                groups[indices[key]].append(t)
            else:
                indices[key] = len(groups)
                groups.append(Tokens(self.lang))
                names.append(self.lang.unhash(key))
                hashes.append(key)
                groups[-1].append(t)
        return names, hashes, groups

    cpdef dict count_by(self, size_t attr):
        counts = {}
        cdef LexID t
        cdef StringHash key
        for t in self.vctr[0]:
            #key = attr_of(t, attr)
            key = 0
            if key not in counts:
                counts[key] = 0
            counts[key] += 1
        return counts
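group_by in the removed Tokens class is a one-pass bucketing by a key function, keeping the name, hash, and group lists aligned by insertion order. The same pattern in plain Python (illustrative; dict and list stand in for the Cython containers):

    def group_by_sketch(tokens, key_of):
        # First sighting of a key creates a new aligned slot in
        # names/hashes/groups; later sightings append to the existing group.
        indices, names, hashes, groups = {}, [], [], []
        for t in tokens:
            key = key_of(t)
            if key not in indices:
                indices[key] = len(groups)
                names.append(str(key))    # would be lang.unhash(key) in the original
                hashes.append(hash(key))
                groups.append([])
            groups[indices[key]].append(t)
        return names, hashes, groups

    names, hashes, groups = group_by_sketch([u'a', u'b', u'a'], key_of=lambda t: t)
    print(names, groups)   # ['a', 'b'] [['a', 'a'], ['b']]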
@@ -1,59 +1,25 @@
from libc.stdint cimport uint32_t
from libc.stdint cimport uint64_t

ctypedef int ClusterID
ctypedef uint32_t StringHash
ctypedef size_t LexID
ctypedef char OrthFlags
ctypedef char DistFlags
ctypedef uint64_t TagFlags
from .typedefs cimport hash_t, utf8_t, flag_t, id_t


cdef enum OrthFlag:
    IS_ALPHA
    IS_DIGIT
    IS_PUNCT
    IS_SPACE
    IS_LOWER
    IS_UPPER
    IS_TITLE
    IS_ASCII
DEF MAX_FLAG = 64


cdef enum:
    NORM
    SHAPE
    LAST3


cdef class Word:
cdef class Lexeme:
    # NB: the readonly keyword refers to _Python_ access. The attributes are
    # writeable from Cython.
    cdef readonly StringHash key
    cdef readonly char** utf8_strings
    cdef readonly id_t id
    cdef readonly size_t length
    cdef readonly double prob
    cdef readonly ClusterID cluster
    cdef readonly TagFlags possible_tags
    cdef readonly DistFlags dist_flags
    cdef readonly OrthFlags orth_flags
    cdef readonly size_t cluster

    cpdef StringHash get_view(self, size_t i) except 0
    cdef readonly utf8_t* strings
    cdef readonly size_t nr_strings

    cdef readonly flag_t flags


cdef class CasedWord(Word):
    cpdef bint can_tag(self, TagFlags flag) except *
    cpdef bint check_dist_flag(self, DistFlags flag) except *
    cpdef bint check_orth_flag(self, OrthFlags flag) except *

    cpdef bint is_often_titled(self) except *
    cpdef bint is_often_uppered(self) except *

    cpdef bint is_alpha(self) except *
    cpdef bint is_digit(self) except *
    cpdef bint is_punct(self) except *
    cpdef bint is_space(self) except *
    cpdef bint is_lower(self) except *
    cpdef bint is_upper(self) except *
    cpdef bint is_title(self) except *
    cpdef bint is_ascii(self) except *
    cpdef bint check_flag(self, size_t flag_id) except *
    cpdef int set_flag(self, size_t flag_id) except -1

    cpdef unicode get_string(self, size_t i) except *
    cpdef id_t get_id(self, size_t i) except 0
    cpdef int add_strings(self, list strings) except -1
spacy/word.pyx | 394
@@ -4,40 +4,32 @@

from libc.stdlib cimport calloc, free


# Python-visible enum for POS tags
PUNCT = 0
CONJ = 1
NUM = 2
X = 3
DET = 4
ADP = 5
ADJ = 6
ADV = 7
VERB = 8
NOUN = 9
PDT = 10
POS = 11
PRON = 12
PRT = 13
from spacy cimport flags


DEF OFT_UPPER = 1
DEF OFT_TITLE = 2


cdef class Word:
cdef class Lexeme:
    """A lexical type.

    Clients should avoid instantiating Lexemes directly, and instead use get_lexeme
    from a language module, e.g. spacy.en.get_lexeme. This allows us to use only
    one Lexeme object per lexical type.

    Attributes:
        string (bytes):
            A utf8-encoded byte-string for the word.

        lex (StringHash):
            A hash of the word.
        id (view_id_t):
            A unique ID of the word's string.

            Implemented as the memory-address of the string,
            as we use Python's string interning to guarantee that only one copy
            of each string is seen.

        string (unicode):
            The unicode string.

            Implemented as a property; relatively expensive.

        length (size_t):
            The (unicode) length of the word.

            The number of unicode code-points in the string.

        prob (double):
            An estimate of the word's unigram log probability.
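The id attribute above leans on CPython string interning: if every equal string is the same object, that object's address can serve as a stable ID. A quick illustration of the idea (CPython-specific behaviour; Python 3 spells the builtin as sys.intern):

    from sys import intern   # plain intern() on the Python 2 of this diff

    a = intern('pineapple' + '!')
    b = intern('pineapple!')
    print(a is b)            # True: interned equal strings are one object...
    print(id(a) == id(b))    # ...so the object's address can act as a unique ID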
@@ -60,186 +52,194 @@ cdef class Word:
    while "dapple" is totally different. On the other hand, "scalable" receives
    the same cluster ID as "pineapple", which is not what we'd like.
    """
    def __cinit__(self, bytes string, list string_views, prob=0.0, cluster=0,
                  orth_flags=0, dist_flags=0, possible_tags=0):
        self.string = <char*>string
        self.length = len(string)
        self.views = <char**>calloc(len(string_views), sizeof(StringHash))
        cdef unicode view
        for i in range(len(string_views)):
            view = string_views[i]
            self.string_views[i] = hash(view)
    def __cinit__(self, utf8_t string, size_t length, list views, prob=0.0,
                  cluster=0, orth_flags=0, dist_flags=0, possible_tags=0):
        self.id = <id_t>&string
        self.length = length
        self.nr_strings = 0
        self.add_views(views)

    def __dealloc__(self):
        free(self.string_views)
        free(self.views)

    cpdef StringHash get_view(self, size_t i) except 0:
        return self.string_views[i]
    property string:
        def __get__(self):
            return self.strings[0].decode('utf8')

    cpdef bint check_orth_flag(self, OrthFlags flag) except *:
        """Access the value of one of the pre-computed boolean orthographic features.
    cpdef unicode get_view_string(self, size_t i) except *:
        assert i < self.nr_strings
        return self.strings[i].decode('utf8')

        Meanings depend on the language-specific orthographic features being loaded.
        The suggested features for latin-alphabet languages are: TODO
        """
        return self.orth_flags & (1 << flag)
    cpdef intptr_t get_view_id(self, size_t i) except 0:
        assert i < self.nr_strings
        return <string_id_t>&self.views[i]

    cpdef bint check_dist_flag(self, DistFlags flag) except *:
    cpdef int add_views(self, list views) except -1:
        self.nr_views += len(strings)
        self.views = <char**>realloc(self.views, self.nr_views * sizeof(utf8_t))
        cdef unicode view
        cdef bytes utf8_string
        for i, view in enumerate(strings):
            view = string_views[i]
            utf8_string = view.encode('utf8')
            # Intern strings, allowing pointer comparison
            utf8_string = intern(utf8_string)
            self.views[i] = utf8_string

    cpdef bint check_flag(self, size_t flag_id) except *:
        """Access the value of one of the pre-computed boolean distribution features.

        Meanings depend on the language-specific distributional features being loaded.
        The suggested features for latin-alphabet languages are: TODO
        """

        return self.dist_flags & (1 << flag)
        assert flag_id < flags.MAX_FLAG
        return self.flags & (1 << flag_id)

    cpdef bint can_tag(self, TagFlags flag) except *:
        """Check whether the word often receives a particular tag in a large text
        corpus. "Often" is chosen by heuristic.
        """
        return self.possible_tags & (1 << flag)
    cpdef int set_flag(self, size_t flag_id) except -1:
        assert flag_id < flags.MAX_FLAG
        self.flags |= (1 << flag_id)


cdef class CasedWord(Word):
    def __cinit__(self, bytes string):
        string_views = [get_normalized(string), get_word_shape(string), string[-3:]]
        Word.__cinit__(self, string, string_views)

    cpdef bint is_often_uppered(self) except *:
        '''Check the OFT_UPPER distributional flag for the word.

        The OFT_UPPER flag records whether a lower-cased version of the word
        is found in all-upper case frequently in a large sample of text, where
        "frequently" is defined as P >= 0.95 (chosen for high mutual information for
        POS tagging).

        Case statistics are estimated from a large text corpus. Estimates are read
        from data/en/case_stats, and can be replaced using spacy.en.load_case_stats.

        >>> is_often_uppered(lookup(u'nato'))
        True
        >>> is_often_uppered(lookup(u'the'))
        False
        '''
        return self.dist_flags & (1 << OFT_UPPER)


    cpdef bint is_often_titled(self) except *:
        '''Check the OFT_TITLE distributional flag for the word.

        The OFT_TITLE flag records whether a lower-cased version of the word
        is found title-cased (see string.istitle) frequently in a large sample of text,
        where "frequently" is defined as P >= 0.3 (chosen for high mutual information for
        POS tagging).

        Case statistics are estimated from a large text corpus. Estimates are read
        from data/en/case_stats, and can be replaced using spacy.en.load_case_stats.

        >>> is_often_titled(lookup(u'john'))
        True
        >>> is_often_titled(lookup(u'Bill'))
        False
        '''
        return self.dist_flags & (1 << OFT_TITLE)


    cpdef bint is_alpha(self) except *:
        """Check whether all characters in the word's string are alphabetic.

        Should match the :py:func:`unicode.isalpha()` function.

        >>> is_alpha(lookup(u'Hello'))
        True
        >>> is_alpha(lookup(u'العرب'))
        True
        >>> is_alpha(lookup(u'10'))
        False
        """
        return self.orth_flags & 1 << IS_ALPHA

    cpdef bint is_digit(self) except *:
        """Check whether all characters in the word's string are numeric.

        Should match the :py:func:`unicode.isdigit()` function.

        >>> is_digit(lookup(u'10'))
        True
        >>> is_digit(lookup(u'๐'))
        True
        >>> is_digit(lookup(u'one'))
        False
        """
        return self.orth_flags & 1 << IS_DIGIT

    cpdef bint is_punct(self) except *:
        """Check whether all characters belong to a punctuation unicode data category
        for a Lexeme ID.

        >>> is_punct(lookup(u'.'))
        True
        >>> is_punct(lookup(u'⁒'))
        True
        >>> is_punct(lookup(u' '))
        False
        """
        return self.orth_flags & 1 << IS_PUNCT

    cpdef bint is_space(self) except *:
        """Give the result of unicode.isspace() for a Lexeme ID.

        >>> is_space(lookup(u'\\t'))
        True
        >>> is_space(lookup(u'<unicode space>'))
        True
        >>> is_space(lookup(u'Hi\\n'))
        False
        """
        return self.orth_flags & 1 << IS_SPACE

    cpdef bint is_lower(self) except *:
        """Give the result of unicode.islower() for a Lexeme ID.

        >>> is_lower(lookup(u'hi'))
        True
        >>> is_lower(lookup(<unicode>))
        True
        >>> is_lower(lookup(u'10'))
        False
        """
        return self.orth_flags & 1 << IS_LOWER

    cpdef bint is_upper(self) except *:
        """Give the result of unicode.isupper() for a Lexeme ID.

        >>> is_upper(lookup(u'HI'))
        True
        >>> is_upper(lookup(u'H10'))
        True
        >>> is_upper(lookup(u'10'))
        False
        """
        return self.orth_flags & 1 << IS_UPPER

    cpdef bint is_title(self) except *:
        """Give the result of unicode.istitle() for a Lexeme ID.

        >>> is_title(lookup(u'Hi'))
        True
        >>> is_title(lookup(u'Hi1'))
        True
        >>> is_title(lookup(u'1'))
        False
        """
        return self.orth_flags & 1 << IS_TITLE

    cpdef bint is_ascii(self) except *:
        """Give the result of checking whether all characters in the string are ascii.

        >>> is_ascii(lookup(u'Hi'))
        True
        >>> is_ascii(lookup(u' '))
        True
        >>> is_ascii(lookup(u'<unicode>'))
        False
        """
        return self.orth_flags & 1 << IS_ASCII
#
#cdef class CasedWord(Word):
#    def __cinit__(self, bytes string, list views):
#        Word.__cinit__(self, string, string_views)
#
#    cpdef bint is_often_uppered(self) except *:
#        '''Check the OFT_UPPER distributional flag for the word.
#
#        The OFT_UPPER flag records whether a lower-cased version of the word
#        is found in all-upper case frequently in a large sample of text, where
#        "frequently" is defined as P >= 0.95 (chosen for high mutual information for
#        POS tagging).
#
#        Case statistics are estimated from a large text corpus. Estimates are read
#        from data/en/case_stats, and can be replaced using spacy.en.load_case_stats.
#
#        >>> is_often_uppered(lookup(u'nato'))
#        True
#        >>> is_often_uppered(lookup(u'the'))
#        False
#        '''
#        return self.dist_flags & (1 << OFT_UPPER)
#
#
#    cpdef bint is_often_titled(self) except *:
#        '''Check the OFT_TITLE distributional flag for the word.
#
#        The OFT_TITLE flag records whether a lower-cased version of the word
#        is found title-cased (see string.istitle) frequently in a large sample of text,
#        where "frequently" is defined as P >= 0.3 (chosen for high mutual information for
#        POS tagging).
#
#        Case statistics are estimated from a large text corpus. Estimates are read
#        from data/en/case_stats, and can be replaced using spacy.en.load_case_stats.
#
#        >>> is_oft_upper(lookup(u'john'))
#        True
#        >>> is_oft_upper(lookup(u'Bill'))
#        False
#        '''
#        return self.dist_flags & (1 << OFT_TITLE)
#
#
#    cpdef bint is_alpha(self) except *:
#        """Check whether all characters in the word's string are alphabetic.
#
#        Should match the :py:func:`unicode.isalpha()` function.
#
#        >>> is_alpha(lookup(u'Hello'))
#        True
#        >>> is_alpha(lookup(u'العرب'))
#        True
#        >>> is_alpha(lookup(u'10'))
#        False
#        """
#        return self.orth_flags & 1 << IS_ALPHA
#
#    cpdef bint is_digit(self) except *:
#        """Check whether all characters in the word's string are numeric.
#
#        Should match the :py:func:`unicode.isdigit()` function.
#
#        >>> is_digit(lookup(u'10'))
#        True
#        >>> is_digit(lookup(u'๐'))
#        True
#        >>> is_digit(lookup(u'one'))
#        False
#        """
#        return self.orth_flags & 1 << IS_DIGIT
#
#    cpdef bint is_punct(self) except *:
#        """Check whether all characters belong to a punctuation unicode data category
#        for a Lexeme ID.
#
#        >>> is_punct(lookup(u'.'))
#        True
#        >>> is_punct(lookup(u'⁒'))
#        True
#        >>> is_punct(lookup(u' '))
#        False
#        """
#        return self.orth_flags & 1 << IS_PUNCT
#
#    cpdef bint is_space(self) except *:
#        """Give the result of unicode.isspace() for a Lexeme ID.
#
#        >>> is_space(lookup(u'\\t'))
#        True
#        >>> is_space(lookup(u'<unicode space>'))
#        True
#        >>> is_space(lookup(u'Hi\\n'))
#        False
#        """
#        return self.orth_flags & 1 << IS_SPACE
#
#    cpdef bint is_lower(self) except *:
#        """Give the result of unicode.islower() for a Lexeme ID.
#
#        >>> is_lower(lookup(u'hi'))
#        True
#        >>> is_lower(lookup(<unicode>))
#        True
#        >>> is_lower(lookup(u'10'))
#        False
#        """
#        return self.orth_flags & 1 << IS_LOWER
#
#    cpdef bint is_upper(self) except *:
#        """Give the result of unicode.isupper() for a Lexeme ID.
#
#        >>> is_upper(lookup(u'HI'))
#        True
#        >>> is_upper(lookup(u'H10'))
#        True
#        >>> is_upper(lookup(u'10'))
#        False
#        """
#        return self.orth_flags & 1 << IS_UPPER
#
#    cpdef bint is_title(self) except *:
#        """Give the result of unicode.istitle() for a Lexeme ID.
#
#        >>> is_title(lookup(u'Hi'))
#        True
#        >>> is_title(lookup(u'Hi1'))
#        True
#        >>> is_title(lookup(u'1'))
#        False
#        """
#        return self.orth_flags & 1 << IS_TITLE
#
#    cpdef bint is_ascii(self) except *:
#        """Give the result of checking whether all characters in the string are ascii.
#
#        >>> is_ascii(lookup(u'Hi'))
#        True
#        >>> is_ascii(lookup(u' '))
#        True
#        >>> is_title(lookup(u'<unicode>'))
#        False
#        """
#        return self.orth_flags & 1 << IS_ASCII