Mirror of https://github.com/explosion/spaCy.git, synced 2024-12-25 17:36:30 +03:00

Commit 68bae2fec6 (parent 88095666dc): "More refactoring"

setup.py | 10
@@ -45,13 +45,13 @@ else:
 exts = [
-    #Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes),
-    Extension("spacy.en", ["spacy/en.pyx"], language="c++",
-              include_dirs=includes),
-    Extension("spacy.ptb3", ["spacy/ptb3.pyx"], language="c++", include_dirs=includes),
-    Extension("spacy.spacy", ["spacy/spacy.pyx"], language="c++", include_dirs=includes),
+    Extension("spacy.lang", ["spacy/spacy.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.word", ["spacy/word.pyx"], language="c++",
               include_dirs=includes),
+    Extension("spacy.ptb3", ["spacy/ptb3.pyx"], language="c++", include_dirs=includes),
+    Extension("spacy.en", ["spacy/en.pyx"], language="c++",
+              include_dirs=includes),
 ]
@@ -1,25 +0,0 @@
-from libc.stdint cimport uint64_t
-
-from chartree cimport CharTree
-
-
-cdef class FixedTable:
-    cdef size_t size
-    cdef uint64_t* keys
-    cdef size_t* values
-
-    cdef size_t insert(self, uint64_t key, size_t value) nogil
-    cdef size_t get(self, uint64_t key) nogil
-    cdef int erase(self, uint64_t key) nogil
-
-
-cdef class WordTree:
-    cdef size_t max_length
-    cdef size_t default
-    cdef CharTree* _trees
-    cdef dict _dict
-
-    cdef size_t get(self, unicode string) except *
-    cdef int set(self, unicode string, size_t value) except *
-    cdef bint contains(self, unicode string) except *
@@ -1,98 +0,0 @@
-from libc.stdlib cimport calloc, free
-import cython
-
-cimport chartree
-
-
-cdef class FixedTable:
-    def __cinit__(self, const size_t size):
-        self.size = size
-        self.keys = <uint64_t*>calloc(self.size, sizeof(uint64_t))
-        self.values = <size_t*>calloc(self.size, sizeof(size_t))
-
-    def __dealloc__(self):
-        free(self.keys)
-        free(self.values)
-
-    def __getitem__(self, uint64_t key):
-        return self.get(key)
-
-    def __setitem__(self, uint64_t key, size_t value):
-        self.insert(key, value)
-
-    def pop(self, uint64_t key):
-        self.delete(key)
-
-    def bucket(self, uint64_t key):
-        return _find(key, self.size)
-
-    cdef size_t insert(self, uint64_t key, size_t value) nogil:
-        cdef size_t bucket = _find(key, self.size)
-        cdef size_t clobbered
-        if self.values[bucket] == value:
-            clobbered = 0
-        else:
-            clobbered = self.values[bucket]
-        self.keys[bucket] = key
-        self.values[bucket] = value
-        return clobbered
-
-    cdef size_t get(self, uint64_t key) nogil:
-        cdef size_t bucket = _find(key, self.size)
-        if self.keys[bucket] == key:
-            return self.values[bucket]
-        else:
-            return 0
-
-    cdef int erase(self, uint64_t key) nogil:
-        cdef size_t bucket = _find(key, self.size)
-        self.keys[bucket] = 0
-        self.values[bucket] = 0
-
-
-@cython.cdivision
-cdef inline size_t _find(uint64_t key, size_t size) nogil:
-    return key % size
-
-
-cdef class WordTree:
-    def __cinit__(self, size_t default, size_t max_length):
-        self.max_length = max_length
-        self.default = default
-        self._trees = <CharTree*>calloc(max_length, sizeof(CharTree))
-        for i in range(self.max_length):
-            chartree.init(&self._trees[i], i)
-        self._dict = {}
-
-    cdef size_t get(self, unicode ustring) except *:
-        cdef bytes bstring = ustring.encode('utf8')
-        cdef size_t length = len(bstring)
-        if length >= self.max_length:
-            return self._dict.get(bstring, 0)
-        else:
-            return chartree.getitem(&self._trees[length], bstring)
-
-    cdef int set(self, unicode ustring, size_t value) except *:
-        cdef bytes bstring = ustring.encode('utf8')
-        cdef size_t length = len(bstring)
-        if length >= self.max_length:
-            self._dict[bstring] = value
-        else:
-            chartree.setitem(&self._trees[length], bstring, value)
-
-    cdef bint contains(self, unicode ustring) except *:
-        cdef bytes bstring = ustring.encode('utf8')
-        cdef size_t length = len(bstring)
-        if length >= self.max_length:
-            return bstring in self._dict
-        else:
-            return chartree.contains(&self._trees[length], bstring)
-
-    def __getitem__(self, unicode key):
-        return self.get(key)
-
-    def __setitem__(self, unicode key, size_t value):
-        self.set(key, value)
-
-    def __contains__(self, unicode key):
-        return self.contains(key)
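Aside: the FixedTable just deleted is a fixed-size, modulo-indexed hash table in which a colliding key simply overwrites ("clobbers") the previous occupant instead of probing, while WordTree dispatches strings to a per-byte-length CharTree and falls back to a plain dict for strings of max_length or more. A rough pure-Python sketch of the FixedTable behaviour, for illustration only (not part of the commit):

# Pure-Python sketch of the deleted FixedTable's semantics.
class FixedTable:
    def __init__(self, size):
        self.size = size
        self.keys = [0] * size      # 0 doubles as "empty", as in the Cython code
        self.values = [0] * size

    def insert(self, key, value):
        bucket = key % self.size    # same as the cdef _find() helper
        clobbered = 0 if self.values[bucket] == value else self.values[bucket]
        self.keys[bucket] = key     # no probing: collisions overwrite
        self.values[bucket] = value
        return clobbered            # caller can see what was evicted

    def get(self, key):
        bucket = key % self.size
        return self.values[bucket] if self.keys[bucket] == key else 0

table = FixedTable(1024)
table.insert(7, 42)
assert table.get(7) == 42
assert table.get(7 + 1024) == 0     # same bucket as 7, but the key differs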
spacy/en.pxd | 36
@@ -1,15 +1,38 @@
-from libcpp.vector cimport vector
-
-from spacy.spacy cimport StringHash
-
 from spacy.spacy cimport Language
-from spacy.word cimport LatinWord
+from spacy.word cimport Lexeme
 cimport cython
 
 
+cpdef size_t ALPHA
+cpdef size_t DIGIT
+cpdef size_t PUNCT
+cpdef size_t SPACE
+cpdef size_t LOWER
+cpdef size_t UPPER
+cpdef size_t TITLE
+cpdef size_t ASCII
+
+cpdef size_t OFT_LOWER
+cpdef size_t OFT_TITLE
+cpdef size_t OFT_UPPER
+
+cpdef size_t PUNCT
+cpdef size_t CONJ
+cpdef size_t NUM
+cpdef size_t N
+cpdef size_t DET
+cpdef size_t ADP
+cpdef size_t ADJ
+cpdef size_t ADV
+cpdef size_t VERB
+cpdef size_t NOUN
+cpdef size_t PDT
+cpdef size_t POS
+cpdef size_t PRON
+cpdef size_t PRT
+
+
 cdef class English(spacy.Language):
     cdef int find_split(self, unicode word)
-    cdef LatinWord new_lexeme(self, unicode string)
 
 
 cdef English EN

@@ -17,4 +40,3 @@ cdef English EN
 cpdef Word lookup(unicode word)
 cpdef list tokenize(unicode string)
-cpdef unicode unhash(StringHash hash_value)
spacy/en.pyx | 102
@@ -43,9 +43,85 @@ from libc.stdint cimport uint64_t
 cimport spacy
 
 
+# Python-readable flag constants --- can't read an enum from Python
+# Don't want to manually assign these numbers, or we'll insert one and have to
+# change them all.
+# Don't use "i", as we don't want it in the global scope!
+cdef size_t __i = 0
+
+ALPHA = __i; __i += 1
+DIGIT = __i; __i += 1
+PUNCT = __i; __i += 1
+SPACE = __i; __i += 1
+LOWER = __i; __i += 1
+UPPER = __i; __i += 1
+TITLE = __i; __i += 1
+ASCII = __i; __i += 1
+
+OFT_LOWER = __i; __i += 1
+OFT_UPPER = __i; __i += 1
+OFT_TITLE = __i; __i += 1
+
+PUNCT = __i; __i += 1
+CONJ = __i; __i += 1
+NUM = __i; __i += 1
+X = __i; __i += 1
+DET = __i; __i += 1
+ADP = __i; __i += 1
+ADJ = __i; __i += 1
+ADV = __i; __i += 1
+VERB = __i; __i += 1
+NOUN = __i; __i += 1
+PDT = __i; __i += 1
+POS = __i; __i += 1
+PRON = __i; __i += 1
+PRT = __i; __i += 1
+
+
+# These are for the string views
+__i = 0
+SIC = __i; __i += 1
+CANON_CASED = __i; __i += 1
+NON_SPARSE = __i; __i += 1
+SHAPE = __i; __i += 1
+NR_STRING_VIEWS = __i
+
+
+def get_string_views(unicode string, lexeme):
+    views = ['' for _ in range(NR_STRING_VIEWS)]
+    views[SIC] = string
+    views[CANON_CASED] = canonicalize_case(string, lexeme)
+    views[SHAPE] = get_string_shape(string)
+    views[NON_SPARSE] = get_non_sparse(string, views[CANON_CASED], views[SHAPE],
+                                       lexeme)
+    return views
+
+
+def set_orth_flags(unicode string, flags_t flags):
+    setters = [
+        (ALPHA, is_alpha),
+        (DIGIT, is_digit),
+        (PUNCT, is_punct),
+        (SPACE, is_space),
+        (LOWER, is_lower),
+        (UPPER, is_upper),
+        (SPACE, is_space)
+    ]
+
+    for bit, setter in setters:
+        if setter(string):
+            flags |= 1 << bit
+    return flags
+
+
 cdef class English(spacy.Language):
-    cdef LatinWord new_lexeme(self, unicode string):
-        return LatinWord(string)
+    cdef Lexeme new_lexeme(self, unicode string, cluster=0, prob=0, case_stats=None,
+                           tag_freqs=None):
+        return Lexeme(s, length, views, prob=prob, cluster=cluster,
+                      flags=self.get_flags(string))
 
     cdef int find_split(self, unicode word):
         cdef size_t length = len(word)
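The constants above are bit positions: set_orth_flags packs one boolean feature per bit of a single integer, and flag accessors read a bit back out. A minimal Python sketch of that packing (the flag names follow the diff; the predicates are stand-ins for the module's is_alpha and friends):

# Bit-flag packing as in set_orth_flags above; the predicates are stand-ins.
ALPHA, DIGIT, LOWER = 0, 1, 4

def set_orth_flags(string, flags=0):
    setters = [(ALPHA, str.isalpha), (DIGIT, str.isdigit), (LOWER, str.islower)]
    for bit, setter in setters:
        if setter(string):
            flags |= 1 << bit           # one bit per boolean feature
    return flags

flags = set_orth_flags(u'hello')
assert flags & (1 << ALPHA)             # reading a bit back out
assert flags & (1 << LOWER)
assert not flags & (1 << DIGIT)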
@@ -101,7 +177,7 @@ cpdef list tokenize(unicode string):
     return EN.tokenize(string)
 
 
-cpdef Word lookup(unicode string):
+cpdef Lexeme lookup(unicode string):
     """Retrieve (or create, if not found) a Lexeme for a string, and return its ID.
 
     Properties of the Lexeme are accessed by passing LexID to the accessor methods.
@@ -116,23 +192,6 @@ cpdef Word lookup(unicode string):
     return EN.lookup(string)
 
 
-cpdef unicode unhash(StringHash hash_value):
-    """Retrieve a string from a hash value. Mostly used for testing.
-
-    In general you should avoid computing with strings, as they are slower than
-    the intended ID-based usage. However, strings can be recovered if necessary,
-    although no control is taken for hash collisions.
-
-    Args:
-        hash_value (StringHash): The hash of a string, returned by Python's hash()
-            function.
-
-    Returns:
-        string (unicode): A unicode string that hashes to the hash_value.
-    """
-    return EN.unhash(hash_value)
-
-
 def add_string_views(view_funcs):
     """Add a string view to existing and previous lexical entries.
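The removed unhash depended on a hash-to-string reverse index (the Language.bacov dict, "vocab" reversed, which this commit also deletes). A minimal sketch of that mechanism, with the same collision caveat the docstring gives; index_string is a hypothetical helper for illustration:

# Minimal sketch of the hash -> string reverse index behind unhash().
# Colliding hashes silently overwrite, the caveat the docstring notes.
bacov = {}

def index_string(s):
    bacov[hash(s)] = s
    return hash(s)

def unhash(hash_value):
    return bacov[hash_value]

key = index_string(u'hello')
assert unhash(key) == u'hello'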
@@ -150,16 +209,19 @@ def load_clusters(location):
     """
     pass
 
 
 def load_unigram_probs(location):
     """Load unigram probabilities.
     """
     pass
 
 
 def load_case_stats(location):
     """Load case stats.
     """
     pass
 
 
 def load_tag_stats(location):
     """Load tag statistics.
     """
@@ -1,16 +1,12 @@
 from libc.stdint cimport uint32_t
 from libc.stdint cimport uint64_t
-from spacy.word cimport Word
+from spacy.word cimport Lexeme
 
-ctypedef uint32_t StringHash
-
 
 cdef class Language:
     cdef object name
-    cdef dict chunks
-    cdef dict vocab
-    cdef dict bacov
+    cdef dict blobs
+    cdef dict lexicon
 
     cpdef list tokenize(self, unicode text)

@@ -20,8 +16,5 @@ cdef class Language:
     cdef list new_chunk(self, unicode string, list substrings)
     cdef Word new_lexeme(self, unicode lex)
-
-    cpdef unicode unhash(self, StringHash hashed)
-
     cpdef list find_substrings(self, unicode chunk)
     cdef int find_split(self, unicode word)
-    cdef int set_orth(self, unicode string, Word word)
@@ -15,16 +15,13 @@ from libc.stdlib cimport calloc, free
 from . import util
 from os import path
 
-TAGS = {}
-DIST_FLAGS = {}
-
 
 cdef class Language:
     view_funcs = []
     def __cinit__(self, name):
         self.name = name
-        self.bacov = {}
-        self.chunks = {}
-        self.vocab = {}
+        self.blobs = {}
+        self.lexicon = {}
         self.load_tokenization(util.read_tokenization(name))
         self.load_dist_info(util.read_dist_info(name))
@@ -37,26 +34,26 @@ cdef class Language:
         string (unicode): The string to split.
 
         Returns:
-            tokens (Tokens): A Tokens object.
+            tokens (list): A list of Lexeme objects.
         """
-        cdef list chunk
+        cdef list blob
         cdef list tokens = []
         cdef size_t length = len(string)
         cdef size_t start = 0
         cdef size_t i = 0
         for c in string:
-            if _is_whitespace(c):
+            if c == ' ':
                 if start < i:
-                    chunk = self.lookup_chunk(string[start:i])
-                    tokens.extend(chunk)
+                    blob = self.lookup_blob(string[start:i])
+                    tokens.extend(blob)
                 start = i + 1
             i += 1
         if start < i:
-            chunk = self.lookup_chunk(string[start:])
+            chunk = self.lookup_blob(string[start:])
             tokens.extend(chunk)
         return tokens
 
-    cdef Word lookup(self, unicode string):
+    cdef Lexeme lookup(self, unicode string):
         assert len(string) != 0
         cdef Word word
         if string in self.vocab:
@@ -65,28 +62,26 @@ cdef class Language:
             word = self.new_lexeme(string)
         return word
 
-    cdef list lookup_chunk(self, unicode string):
+    cdef list lookup_blob(self, unicode string):
         cdef list chunk
-        cdef size_t chunk_id
-        if string in self.chunks:
-            chunk = self.chunks[string]
+        cdef size_t blob_id
+        if string in self.blobs:
+            blob = self.blobs[string]
         else:
-            chunk = self.new_chunk(string, self.find_substrings(string))
+            blob = self.new_blob(string, self.find_substrings(string))
         return chunk
 
-    cdef list new_chunk(self, unicode string, list substrings):
-        chunk = []
+    cdef list new_blob(self, unicode string, list substrings):
+        blob = []
         for i, substring in enumerate(substrings):
-            chunk.append(self.lookup(substring))
-        self.chunks[string] = chunk
-        return chunk
+            blob.append(self.lookup(substring))
+        self.blobs[string] = chunk
+        return blob
 
     cdef Word new_lexeme(self, unicode string):
-        string_views = [view_func(string) for view_func in self.view_funcs]
-        word = Word(string.encode('utf8'), string_views)
-        self.bacov[word.lex] = string
-        self.vocab[string] = word
-        return word
+        # TODO
+        #lexeme = Lexeme(string.encode('utf8'), string_views)
+        #return lexeme
 
     """
     def add_view_funcs(self, list view_funcs):
@@ -112,11 +107,7 @@ cdef class Language:
         self.bacov[hashed] = view
     """
 
-    cpdef unicode unhash(self, StringHash hash_value):
-        '''Fetch a string from the reverse index, given its hash value.'''
-        return self.bacov[hash_value]
-
-    cpdef list find_substrings(self, unicode chunk):
+    cpdef list find_substrings(self, unicode blob):
         """Find how to split a chunk into substrings.
 
         This method calls find_split repeatedly. Most languages will want to
@@ -129,21 +120,18 @@ cdef class Language:
             substrings (list): The component substrings, e.g. [u"Mike", "'s", "!"].
         """
         substrings = []
-        while chunk:
-            split = self.find_split(chunk)
+        while blob:
+            split = self.find_split(blob)
             if split == 0:
-                substrings.append(chunk)
+                substrings.append(blob)
                 break
-            substrings.append(chunk[:split])
-            chunk = chunk[split:]
+            substrings.append(blob[:split])
+            blob = blob[split:]
         return substrings
 
     cdef int find_split(self, unicode word):
         return len(word)
 
-    cdef int set_orth(self, unicode string, Word word):
-        pass
-
     def load_tokenization(self, token_rules):
         '''Load special-case tokenization rules.

@@ -178,22 +166,3 @@ cdef class Language:
                 w.dist_flags |= DIST_FLAGS[flag]
             for tag in word_dist.tagdict:
                 w.possible_tags |= TAGS[tag]
-
-
-cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
-    if c == ' ':
-        return True
-    elif c == '\n':
-        return True
-    elif c == '\t':
-        return True
-    else:
-        return False
-
-
-#cdef inline int _extend(Tokens tokens, Lexeme** chunk) nogil:
-#    cdef size_t i = 0
-#    while chunk[i] != NULL:
-#        tokens.vctr[0].push_back(<Lexeme_addr>chunk[i])
-#        tokens.length += 1
-#        i += 1
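Taken together, tokenize and find_substrings implement a two-level scheme: split on whitespace first, then carve each whitespace-delimited blob into substrings via repeated find_split calls, caching the result per blob. A rough pure-Python sketch of that control flow (plain strings stand in for the Lexeme machinery; not part of the commit):

# Two-level tokenization as in Language.tokenize(), with per-blob caching.
blobs = {}

def find_split(blob):
    return len(blob)                 # base Language never splits further

def find_substrings(blob):
    substrings = []
    while blob:
        split = find_split(blob)
        if split == 0:               # refuse to split: keep the rest whole
            substrings.append(blob)
            break
        substrings.append(blob[:split])
        blob = blob[split:]
    return substrings

def tokenize(string):
    tokens = []
    for piece in string.split(' '):
        if piece:
            if piece not in blobs:   # each distinct blob is analysed once
                blobs[piece] = find_substrings(piece)
            tokens.extend(blobs[piece])
    return tokens

assert tokenize(u"Hello world") == [u"Hello", u"world"]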
@@ -1,32 +0,0 @@
-cdef enum OrthFlag:
-    IS_ALPHA
-    IS_DIGIT
-    IS_PUNCT
-    IS_SPACE
-    IS_LOWER
-    IS_UPPER
-    IS_TITLE
-    IS_ASCII
-
-
-cdef enum:
-    NORM
-    SHAPE
-    LAST3
-
-from spacy.lexeme cimport LexID
-from spacy.lexeme cimport StringHash
-
-cpdef bint is_alpha(LexID lex_id) except *
-cpdef bint is_digit(LexID lex_id) except *
-cpdef bint is_punct(LexID lex_id) except *
-cpdef bint is_space(LexID lex_id) except *
-cpdef bint is_lower(LexID lex_id) except *
-cpdef bint is_upper(LexID lex_id) except *
-cpdef bint is_title(LexID lex_id) except *
-cpdef bint is_ascii(LexID lex_id) except *
-
-
-cpdef StringHash norm_of(LexID lex_id) except 0
-cpdef StringHash shape_of(LexID lex_id) except 0
-cpdef StringHash last3_of(LexID lex_id) except 0
@@ -1,211 +0,0 @@
-# cython: embedsignature=True
-from __future__ import unicode_literals
-
-from spacy.lexeme cimport Lexeme
-
-
-def get_normalized(unicode word):
-    """Todo.
-
-    Args:
-        word (unicode)
-
-    Returns:
-        normalized (unicode)
-    """
-    if word.isalpha() and word.islower():
-        return word
-    else:
-        return get_word_shape(word)
-
-
-def get_word_shape(unicode word):
-    """Todo.
-
-    Args:
-        word (unicode)
-
-    Returns:
-        shape (unicode)
-    """
-    cdef size_t length = len(word)
-    shape = ""
-    last = ""
-    shape_char = ""
-    seq = 0
-    for c in word:
-        if c.isalpha():
-            if c.isupper():
-                shape_char = "X"
-            else:
-                shape_char = "x"
-        elif c.isdigit():
-            shape_char = "d"
-        else:
-            shape_char = c
-        if shape_char == last:
-            seq += 1
-        else:
-            seq = 0
-            last = shape_char
-        if seq < 3:
-            shape += shape_char
-    assert shape
-    return shape
-
-
-cpdef unicode get_last3(unicode string):
-    return string[-3:]
-
-
-cpdef bint is_alpha(LexID lex_id) except *:
-    """Check whether all characters in the word's string are alphabetic.
-
-    Should match the :py:func:`unicode.isalpha()` function.
-
-    >>> is_alpha(lookup(u'Hello'))
-    True
-    >>> is_alpha(lookup(u'العرب'))
-    True
-    >>> is_alpha(lookup(u'10'))
-    False
-    """
-    return (<Lexeme*>lex_id).orth_flags & 1 << IS_ALPHA
-
-
-cpdef bint is_digit(LexID lex_id) except *:
-    """Check whether all characters in the word's string are numeric.
-
-    Should match the :py:func:`unicode.isdigit()` function.
-
-    >>> is_digit(lookup(u'10'))
-    True
-    >>> is_digit(lookup(u'๐'))
-    True
-    >>> is_digit(lookup(u'one'))
-    False
-    """
-    return (<Lexeme*>lex_id).orth_flags & 1 << IS_DIGIT
-
-
-cpdef bint is_punct(LexID lex_id) except *:
-    """Check whether all characters belong to a punctuation unicode data category
-    for a Lexeme ID.
-
-    >>> is_punct(lookup(u'.'))
-    True
-    >>> is_punct(lookup(u'⁒'))
-    True
-    >>> is_punct(lookup(u' '))
-    False
-    """
-    return (<Lexeme*>lex_id).orth_flags & 1 << IS_PUNCT
-
-
-cpdef bint is_space(LexID lex_id) except *:
-    """Give the result of unicode.isspace() for a Lexeme ID.
-
-    >>> is_space(lookup(u'\\t'))
-    True
-    >>> is_space(lookup(u'<unicode space>'))
-    True
-    >>> is_space(lookup(u'Hi\\n'))
-    False
-    """
-    return (<Lexeme*>lex_id).orth_flags & 1 << IS_SPACE
-
-
-cpdef bint is_lower(LexID lex_id) except *:
-    """Give the result of unicode.islower() for a Lexeme ID.
-
-    >>> is_lower(lookup(u'hi'))
-    True
-    >>> is_lower(lookup(<unicode>))
-    True
-    >>> is_lower(lookup(u'10'))
-    False
-    """
-    return (<Lexeme*>lex_id).orth_flags & 1 << IS_LOWER
-
-
-cpdef bint is_upper(LexID lex_id) except *:
-    """Give the result of unicode.isupper() for a Lexeme ID.
-
-    >>> is_upper(lookup(u'HI'))
-    True
-    >>> is_upper(lookup(u'H10'))
-    True
-    >>> is_upper(lookup(u'10'))
-    False
-    """
-    return (<Lexeme*>lex_id).orth_flags & 1 << IS_UPPER
-
-
-cpdef bint is_title(LexID lex_id) except *:
-    """Give the result of unicode.istitle() for a Lexeme ID.
-
-    >>> is_title(lookup(u'Hi'))
-    True
-    >>> is_title(lookup(u'Hi1'))
-    True
-    >>> is_title(lookup(u'1'))
-    False
-    """
-    return (<Lexeme*>lex_id).orth_flags & 1 << IS_TITLE
-
-
-cpdef bint is_ascii(LexID lex_id) except *:
-    """Give the result of checking whether all characters in the string are ascii.
-
-    >>> is_ascii(lookup(u'Hi'))
-    True
-    >>> is_ascii(lookup(u' '))
-    True
-    >>> is_title(lookup(u'<unicode>'))
-    False
-    """
-    return (<Lexeme*>lex_id).orth_flags & 1 << IS_ASCII
-
-
-cpdef StringHash norm_of(LexID lex_id) except 0:
-    """Return the hash of a "normalized" version of the string.
-
-    Normalized strings are intended to be less sparse, while still capturing
-    important lexical information. See :py:func:`spacy.latin.orthography.normalize_string`
-    for details of the normalization function.
-
-    >>> unhash(norm_of(lookup(u'Hi')))
-    u'hi'
-    >>> unhash(norm_of(lookup(u'255667')))
-    u'shape=dddd'
-    >>> unhash(norm_of(lookup(u'...')))
-    u'...'
-    """
-    return (<Lexeme*>lex_id).string_views[NORM]
-
-
-cpdef StringHash shape_of(LexID lex_id) except 0:
-    """Return the hash of a string describing the word's "orthographic shape".
-
-    Orthographic shapes are calculated by the :py:func:`spacy.orthography.latin.string_shape`
-    function. Word shape features have been found useful for NER and POS tagging,
-    e.g. Manning (2011)
-
-    >>> unhash(shape_of(lookup(u'Hi')))
-    u'Xx'
-    >>> unhash(shape_of(lookup(u'255667')))
-    u'dddd'
-    >>> unhash(shape_of(lookup(u'...')))
-    u'...'
-    """
-    cdef Lexeme* w = <Lexeme*>lex_id
-    return w.string_views[SHAPE]
-
-
-cpdef StringHash last3_of(LexID lex_id) except 0:
-    '''Return the hash of string[-3:], i.e. the last three characters of the word.
-
-    >>> lex_ids = [lookup(w) for w in (u'Hello', u'!')]
-    >>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
-    [u'llo', u'!']
-    '''
-    return (<Lexeme*>lex_id).string_views[LAST3]
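For intuition about the shape feature, a direct pure-Python port of the get_word_shape just deleted: character classes map to X/x/d, and runs longer than three characters are truncated (note this yields u'ddd' for u'255667', slightly at odds with the u'dddd' shown in the norm_of docstring):

# Direct port of the deleted get_word_shape(), minus the Cython typing.
def word_shape(word):
    shape = ""
    last = ""
    seq = 0
    for c in word:
        if c.isalpha():
            shape_char = "X" if c.isupper() else "x"
        elif c.isdigit():
            shape_char = "d"
        else:
            shape_char = c
        if shape_char == last:
            seq += 1                 # extend the current run
        else:
            seq = 0
            last = shape_char
        if seq < 3:                  # runs are capped at three characters
            shape += shape_char
    return shape

assert word_shape(u"Hello") == u"Xxxx"
assert word_shape(u"255667") == u"ddd"
assert word_shape(u"don't") == u"xxx'x"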
@@ -1,7 +0,0 @@
-cpdef bytes to_bytes(unicode string)
-
-cpdef unicode from_bytes(bytes string)
-
-cpdef unicode substr(unicode string, int start, int end, size_t length)
-
-cdef bint is_whitespace(Py_UNICODE c)
@@ -1,35 +0,0 @@
-# cython: profile=True
-
-cpdef bytes to_bytes(unicode string):
-    return string.encode('utf8')
-
-
-cpdef unicode from_bytes(bytes string):
-    return string.decode('utf8')
-
-
-cpdef unicode substr(unicode string, int start, int end, size_t length):
-    if end >= length:
-        end = -1
-    if start >= length:
-        start = 0
-    if start <= 0 and end < 0:
-        return string
-    elif start < 0:
-        start = 0
-    elif end < 0:
-        end = length
-    return string[start:end]
-
-
-cdef bint is_whitespace(Py_UNICODE c):
-    # TODO: Support other unicode spaces
-    # https://www.cs.tut.fi/~jkorpela/chars/spaces.html
-    if c == u' ':
-        return True
-    elif c == u'\n':
-        return True
-    elif c == u'\t':
-        return True
-    else:
-        return False
@@ -1,18 +0,0 @@
-from libcpp.vector cimport vector
-from spacy.lexeme cimport LexID
-from spacy.lexeme cimport Lexeme
-
-from cython.operator cimport dereference as deref
-from spacy.spacy cimport Language
-
-
-cdef class Tokens:
-    cdef Language lang
-    cdef vector[LexID]* vctr
-    cdef size_t length
-
-    cpdef int append(self, LexID token)
-    cpdef int extend(self, Tokens other) except -1
-
-    cpdef object group_by(self, size_t attr)
-    cpdef dict count_by(self, size_t attr)
@@ -1,92 +0,0 @@
-from cython.operator cimport dereference as deref
-from cython.operator cimport preincrement as inc
-
-
-from spacy.lexeme cimport Lexeme
-from spacy.spacy cimport StringHash
-
-
-cdef class Tokens:
-    def __cinit__(self, Language lang):
-        self.lang = lang
-        self.vctr = new vector[LexID]()
-        self.length = 0
-
-    def __dealloc__(self):
-        del self.vctr
-
-    def __iter__(self):
-        cdef vector[LexID].iterator it = self.vctr[0].begin()
-        while it != self.vctr[0].end():
-            yield deref(it)
-            inc(it)
-
-    def __getitem__(self, size_t idx):
-        return self.vctr[0].at(idx)
-
-    def __len__(self):
-        return self.length
-
-    cpdef int append(self, LexID token):
-        self.vctr[0].push_back(token)
-        self.length += 1
-
-    cpdef int extend(self, Tokens other) except -1:
-        cdef LexID el
-        for el in other:
-            self.append(el)
-
-    cpdef object group_by(self, size_t view_idx):
-        '''Group tokens that share the property attr into Tokens instances, and
-        return a list of them. Returns a tuple of three lists:
-
-        (string names, hashes, tokens)
-
-        The lists are aligned, so the ith entry in string names is the string
-        that the ith entry in hashes unhashes to, which the Tokens instance
-        is grouped by.
-
-        You can then use count_by or group_by on the Tokens
-        for further processing. Calling group_by and then asking the length
-        of the Tokens objects is equivalent to count_by, but somewhat slower.
-        '''
-        # Implementation here is working around some of the constraints in
-        # Cython about what type of thing can go in what type of container.
-        # Long story short, it's pretty hard to get a Python object like
-        # Tokens into a vector or array. If we really need this to run faster,
-        # we can be tricky and get the Python list access out of the loop. What
-        # we'd do is store pointers to the underlying vectors.
-        # So far, speed isn't mattering here.
-        cdef dict indices = {}
-        cdef list groups = []
-        cdef list names = []
-        cdef list hashes = []
-
-        cdef StringHash key
-        cdef LexID t
-        for t in self.vctr[0]:
-            if view_idx == 0:
-                key = (<Lexeme*>t).lex
-            else:
-                key = (<Lexeme*>t).string_views[view_idx - 1]
-            if key in indices:
-                groups[indices[key]].append(t)
-            else:
-                indices[key] = len(groups)
-                groups.append(Tokens(self.lang))
-                names.append(self.lang.unhash(key))
-                hashes.append(key)
-                groups[-1].append(t)
-        return names, hashes, groups
-
-    cpdef dict count_by(self, size_t attr):
-        counts = {}
-        cdef LexID t
-        cdef StringHash key
-        for t in self.vctr[0]:
-            #key = attr_of(t, attr)
-            key = 0
-            if key not in counts:
-                counts[key] = 0
-            counts[key] += 1
-        return counts
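The deleted group_by returns three aligned lists: the i-th name is the unhashed form of the i-th hash, and the i-th group holds the tokens sharing that key. A pure-Python rendering of that contract, with plain values standing in for the StringHash/Lexeme* machinery:

# Sketch of the deleted Tokens.group_by() contract: three aligned lists.
def group_by(tokens, key_of):
    indices = {}
    names, hashes, groups = [], [], []
    for t in tokens:
        key = key_of(t)
        if key not in indices:
            indices[key] = len(groups)
            groups.append([])            # a list stands in for a Tokens object
            names.append(str(key))       # stands in for lang.unhash(key)
            hashes.append(key)
        groups[indices[key]].append(t)
    return names, hashes, groups

names, hashes, groups = group_by([u"a", u"B", u"c"], key_of=str.isupper)
assert names == [u"False", u"True"]
assert groups == [[u"a", u"c"], [u"B"]]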
@@ -1,59 +1,25 @@
-from libc.stdint cimport uint32_t
-from libc.stdint cimport uint64_t
-
-ctypedef int ClusterID
-ctypedef uint32_t StringHash
-ctypedef size_t LexID
-ctypedef char OrthFlags
-ctypedef char DistFlags
-ctypedef uint64_t TagFlags
-
-
-cdef enum OrthFlag:
-    IS_ALPHA
-    IS_DIGIT
-    IS_PUNCT
-    IS_SPACE
-    IS_LOWER
-    IS_UPPER
-    IS_TITLE
-    IS_ASCII
-
-
-cdef enum:
-    NORM
-    SHAPE
-    LAST3
-
-
-cdef class Word:
+from .typedefs cimport hash_t, utf8_t, flag_t, id_t
+
+
+DEF MAX_FLAG = 64
+
+
+cdef class Lexeme:
     # NB: the readonly keyword refers to _Python_ access. The attributes are
     # writeable from Cython.
-    cdef readonly StringHash key
-    cdef readonly char** utf8_strings
+    cdef readonly id_t id
     cdef readonly size_t length
     cdef readonly double prob
-    cdef readonly ClusterID cluster
-    cdef readonly TagFlags possible_tags
-    cdef readonly DistFlags dist_flags
-    cdef readonly OrthFlags orth_flags
-
-    cpdef StringHash get_view(self, size_t i) except 0
-
-
-cdef class CasedWord(Word):
-    cpdef bint can_tag(self, TagFlags flag) except *
-    cpdef bint check_dist_flag(self, DistFlags flag) except *
-    cpdef bint check_orth_flag(self, OrthFlags flag) except *
-
-    cpdef bint is_often_titled(self) except *
-    cpdef bint is_often_uppered(self) except *
-
-    cpdef bint is_alpha(self) except *
-    cpdef bint is_digit(self) except *
-    cpdef bint is_punct(self) except *
-    cpdef bint is_space(self) except *
-    cpdef bint is_lower(self) except *
-    cpdef bint is_upper(self) except *
-    cpdef bint is_title(self) except *
-    cpdef bint is_ascii(self) except *
+    cdef readonly size_t cluster
+
+    cdef readonly utf8_t* strings
+    cdef readonly size_t nr_strings
+
+    cdef readonly flag_t flags
+
+    cpdef bint check_flag(self, size_t flag_id) except *
+    cpdef int set_flag(self, size_t flag_id) except -1
+
+    cpdef unicode get_string(self, size_t i) except *
+    cpdef id_t get_id(self, size_t i) except 0
+    cpdef int add_strings(self, list strings) except -1
spacy/word.pyx | 394
@@ -4,40 +4,32 @@
 
 from libc.stdlib cimport calloc, free
 
-# Python-visible enum for POS tags
-PUNCT = 0
-CONJ = 1
-NUM = 2
-X = 3
-DET = 4
-ADP = 5
-ADJ = 6
-ADV = 7
-VERB = 8
-NOUN = 9
-PDT = 10
-POS = 11
-PRON = 12
-PRT = 13
-
-
-DEF OFT_UPPER = 1
-DEF OFT_TITLE = 2
-
-
-cdef class Word:
+from spacy cimport flags
+
+
+cdef class Lexeme:
     """A lexical type.
 
+    Clients should avoid instantiating Lexemes directly, and instead use get_lexeme
+    from a language module, e.g. spacy.en.get_lexeme . This allows us to use only
+    one Lexeme object per lexical type.
+
     Attributes:
-        string (bytes):
-            A utf8-encoded byte-string for the word.
-
-        lex (StringHash):
-            A hash of the word.
+        id (view_id_t):
+            A unique ID of the word's string.
+
+            Implemented as the memory-address of the string,
+            as we use Python's string interning to guarantee that only one copy
+            of each string is seen.
+
+        string (unicode):
+            The unicode string.
+
+            Implemented as a property; relatively expensive.
 
         length (size_t):
-            The (unicode) length of the word.
+            The number of unicode code-points in the string.
 
         prob (double):
             An estimate of the word's unigram log probability.
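The new docstring leans on CPython string interning: interning guarantees one canonical object per distinct string, so the object's address can serve as a stable unique ID and equality reduces to pointer comparison. A small illustration of that premise (sys.intern in Python 3; a builtin intern() in the Python 2 of this commit's era):

# Interned equal strings are the same object, so the address is a usable ID.
import sys

a = sys.intern("pine" + "apple")
b = sys.intern("pineapple")
assert a is b                       # one canonical object per distinct string
assert id(a) == id(b)               # identity can stand in for the string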
@@ -60,186 +52,194 @@ cdef class Word:
     while "dapple" is totally different. On the other hand, "scalable" receives
     the same cluster ID as "pineapple", which is not what we'd like.
     """
-    def __cinit__(self, bytes string, list string_views, prob=0.0, cluster=0,
-                  orth_flags=0, dist_flags=0, possible_tags=0):
-        self.string = <char*>string
-        self.length = len(string)
-        self.views = <char**>calloc(len(string_views), sizeof(StringHash))
-        cdef unicode view
-        for i in range(len(string_views)):
-            view = string_views[i]
-            self.string_views[i] = hash(view)
+    def __cinit__(self, utf8_t string, size_t length, list views, prob=0.0,
+                  cluster=0, orth_flags=0, dist_flags=0, possible_tags=0):
+        self.id = <id_t>&string
+        self.length = length
+        self.nr_strings = 0
+        self.add_views(views)
 
     def __dealloc__(self):
-        free(self.string_views)
+        free(self.views)
 
-    cpdef StringHash get_view(self, size_t i) except 0:
-        return self.string_views[i]
-
-    cpdef bint check_orth_flag(self, OrthFlags flag) except *:
-        """Access the value of one of the pre-computed boolean orthographic features.
-
-        Meanings depend on the language-specific orthographic features being loaded.
-        The suggested features for latin-alphabet languages are: TODO
-        """
-        return self.orth_flags & (1 << flag)
-
-    cpdef bint check_dist_flag(self, DistFlags flag) except *:
+    property string:
+        def __get__(self):
+            return self.strings[0].decode('utf8')
+
+    cpdef unicode get_view_string(self, size_t i) except *:
+        assert i < self.nr_strings
+        return self.strings[i].decode('utf8')
+
+    cpdef intptr_t get_view_id(self, size_t i) except 0:
+        assert i < self.nr_strings
+        return <string_id_t>&self.views[i]
+
+    cpdef int add_views(self, list views) except -1:
+        self.nr_views += len(strings)
+        self.views = <char**>realloc(self.views, self.nr_views * sizeof(utf8_t))
+        cdef unicode view
+        cdef bytes utf8_string
+        for i, view in enumerate(strings):
+            view = string_views[i]
+            utf8_string = view.encode('utf8')
+            # Intern strings, allowing pointer comparison
+            utf8_string = intern(utf8_string)
+            self.views[i] = utf8_string
+
+    cpdef bint check_flag(self, size_t flag_id) except *:
         """Access the value of one of the pre-computed boolean distribution features.
 
         Meanings depend on the language-specific distributional features being loaded.
         The suggested features for latin-alphabet languages are: TODO
         """
-        return self.dist_flags & (1 << flag)
+        assert flag_id < flags.MAX_FLAG
+        return self.flags & (1 << flag_id)
 
-    cpdef bint can_tag(self, TagFlags flag) except *:
-        """Check whether the word often receives a particular tag in a large text
-        corpus. "Often" is chosen by heuristic.
-        """
-        return self.possible_tags & (1 << flag)
-
-
-cdef class CasedWord(Word):
-    def __cinit__(self, bytes string):
-        string_views = [get_normaized(string), get_word_shape(string), string[-3:]]
-        Word.__cinit__(self, string, string_views)
-
-    cpdef bint is_often_uppered(self) except *:
-        '''Check the OFT_UPPER distributional flag for the word.
-
-        The OFT_UPPER flag records whether a lower-cased version of the word
-        is found in all-upper case frequently in a large sample of text, where
-        "frequently" is defined as P >= 0.95 (chosen for high mutual information for
-        POS tagging).
-
-        Case statistics are estimated from a large text corpus. Estimates are read
-        from data/en/case_stats, and can be replaced using spacy.en.load_case_stats.
-
-        >>> is_often_uppered(lookup(u'nato'))
-        True
-        >>> is_often_uppered(lookup(u'the'))
-        False
-        '''
-        return self.dist_flags & (1 << OFT_UPPER)
-
-    cpdef bint is_often_titled(self) except *:
-        '''Check the OFT_TITLE distributional flag for the word.
-
-        The OFT_TITLE flag records whether a lower-cased version of the word
-        is found title-cased (see string.istitle) frequently in a large sample of text,
-        where "frequently" is defined as P >= 0.3 (chosen for high mutual information for
-        POS tagging).
-
-        Case statistics are estimated from a large text corpus. Estimates are read
-        from data/en/case_stats, and can be replaced using spacy.en.load_case_stats.
-
-        >>> is_oft_upper(lookup(u'john'))
-        True
-        >>> is_oft_upper(lookup(u'Bill'))
-        False
-        '''
-        return self.dist_flags & (1 << OFT_TITLE)
-
-    cpdef bint is_alpha(self) except *:
-        """Check whether all characters in the word's string are alphabetic.
-
-        Should match the :py:func:`unicode.isalpha()` function.
-
-        >>> is_alpha(lookup(u'Hello'))
-        True
-        >>> is_alpha(lookup(u'العرب'))
-        True
-        >>> is_alpha(lookup(u'10'))
-        False
-        """
-        return self.orth_flags & 1 << IS_ALPHA
-
-    cpdef bint is_digit(self) except *:
-        """Check whether all characters in the word's string are numeric.
-
-        Should match the :py:func:`unicode.isdigit()` function.
-
-        >>> is_digit(lookup(u'10'))
-        True
-        >>> is_digit(lookup(u'๐'))
-        True
-        >>> is_digit(lookup(u'one'))
-        False
-        """
-        return self.orth_flags & 1 << IS_DIGIT
-
-    cpdef bint is_punct(self) except *:
-        """Check whether all characters belong to a punctuation unicode data category
-        for a Lexeme ID.
-
-        >>> is_punct(lookup(u'.'))
-        True
-        >>> is_punct(lookup(u'⁒'))
-        True
-        >>> is_punct(lookup(u' '))
-        False
-        """
-        return self.orth_flags & 1 << IS_PUNCT
-
-    cpdef bint is_space(self) except *:
-        """Give the result of unicode.isspace() for a Lexeme ID.
-
-        >>> is_space(lookup(u'\\t'))
-        True
-        >>> is_space(lookup(u'<unicode space>'))
-        True
-        >>> is_space(lookup(u'Hi\\n'))
-        False
-        """
-        return self.orth_flags & 1 << IS_SPACE
-
-    cpdef bint is_lower(self) except *:
-        """Give the result of unicode.islower() for a Lexeme ID.
-
-        >>> is_lower(lookup(u'hi'))
-        True
-        >>> is_lower(lookup(<unicode>))
-        True
-        >>> is_lower(lookup(u'10'))
-        False
-        """
-        return self.orth_flags & 1 << IS_LOWER
-
-    cpdef bint is_upper(self) except *:
-        """Give the result of unicode.isupper() for a Lexeme ID.
-
-        >>> is_upper(lookup(u'HI'))
-        True
-        >>> is_upper(lookup(u'H10'))
-        True
-        >>> is_upper(lookup(u'10'))
-        False
-        """
-        return self.orth_flags & 1 << IS_UPPER
-
-    cpdef bint is_title(self) except *:
-        """Give the result of unicode.istitle() for a Lexeme ID.
-
-        >>> is_title(lookup(u'Hi'))
-        True
-        >>> is_title(lookup(u'Hi1'))
-        True
-        >>> is_title(lookup(u'1'))
-        False
-        """
-        return self.orth_flags & 1 << IS_TITLE
-
-    cpdef bint is_ascii(self) except *:
-        """Give the result of checking whether all characters in the string are ascii.
-
-        >>> is_ascii(lookup(u'Hi'))
-        True
-        >>> is_ascii(lookup(u' '))
-        True
-        >>> is_title(lookup(u'<unicode>'))
-        False
-        """
-        return self.orth_flags & 1 << IS_ASCII
+    cpdef int set_flag(self, size_t flag_id) except -1:
+        assert flag_id < flags.MAX_FLAG
+        self.flags |= (1 << flag_id)
+
+
+#
+#cdef class CasedWord(Word):
+#    def __cinit__(self, bytes string, list views):
+#        Word.__cinit__(self, string, string_views)
+#
+#    cpdef bint is_often_uppered(self) except *:
+#        '''Check the OFT_UPPER distributional flag for the word.
+#
+#        The OFT_UPPER flag records whether a lower-cased version of the word
+#        is found in all-upper case frequently in a large sample of text, where
+#        "frequently" is defined as P >= 0.95 (chosen for high mutual information for
+#        POS tagging).
+#
+#        Case statistics are estimated from a large text corpus. Estimates are read
+#        from data/en/case_stats, and can be replaced using spacy.en.load_case_stats.
+#
+#        >>> is_often_uppered(lookup(u'nato'))
+#        True
+#        >>> is_often_uppered(lookup(u'the'))
+#        False
+#        '''
+#        return self.dist_flags & (1 << OFT_UPPER)
+#
+#
+#    cpdef bint is_often_titled(self) except *:
+#        '''Check the OFT_TITLE distributional flag for the word.
+#
+#        The OFT_TITLE flag records whether a lower-cased version of the word
+#        is found title-cased (see string.istitle) frequently in a large sample of text,
+#        where "frequently" is defined as P >= 0.3 (chosen for high mutual information for
+#        POS tagging).
+#
+#        Case statistics are estimated from a large text corpus. Estimates are read
+#        from data/en/case_stats, and can be replaced using spacy.en.load_case_stats.
+#
+#        >>> is_oft_upper(lookup(u'john'))
+#        True
+#        >>> is_oft_upper(lookup(u'Bill'))
+#        False
+#        '''
+#        return self.dist_flags & (1 << OFT_TITLE)
+#
+#
+#    cpdef bint is_alpha(self) except *:
+#        """Check whether all characters in the word's string are alphabetic.
+#
+#        Should match the :py:func:`unicode.isalpha()` function.
+#
+#        >>> is_alpha(lookup(u'Hello'))
+#        True
+#        >>> is_alpha(lookup(u'العرب'))
+#        True
+#        >>> is_alpha(lookup(u'10'))
+#        False
+#        """
+#        return self.orth_flags & 1 << IS_ALPHA
+#
+#    cpdef bint is_digit(self) except *:
+#        """Check whether all characters in the word's string are numeric.
+#
+#        Should match the :py:func:`unicode.isdigit()` function.
+#
+#        >>> is_digit(lookup(u'10'))
+#        True
+#        >>> is_digit(lookup(u'๐'))
+#        True
+#        >>> is_digit(lookup(u'one'))
+#        False
+#        """
+#        return self.orth_flags & 1 << IS_DIGIT
+#
+#    cpdef bint is_punct(self) except *:
+#        """Check whether all characters belong to a punctuation unicode data category
+#        for a Lexeme ID.
+#
+#        >>> is_punct(lookup(u'.'))
+#        True
+#        >>> is_punct(lookup(u'⁒'))
+#        True
+#        >>> is_punct(lookup(u' '))
+#        False
+#        """
+#        return self.orth_flags & 1 << IS_PUNCT
+#
+#    cpdef bint is_space(self) except *:
+#        """Give the result of unicode.isspace() for a Lexeme ID.
+#
+#        >>> is_space(lookup(u'\\t'))
+#        True
+#        >>> is_space(lookup(u'<unicode space>'))
+#        True
+#        >>> is_space(lookup(u'Hi\\n'))
+#        False
+#        """
+#        return self.orth_flags & 1 << IS_SPACE
+#
+#    cpdef bint is_lower(self) except *:
+#        """Give the result of unicode.islower() for a Lexeme ID.
+#
+#        >>> is_lower(lookup(u'hi'))
+#        True
+#        >>> is_lower(lookup(<unicode>))
+#        True
+#        >>> is_lower(lookup(u'10'))
+#        False
+#        """
+#        return self.orth_flags & 1 << IS_LOWER
+#
+#    cpdef bint is_upper(self) except *:
+#        """Give the result of unicode.isupper() for a Lexeme ID.
+#
+#        >>> is_upper(lookup(u'HI'))
+#        True
+#        >>> is_upper(lookup(u'H10'))
+#        True
+#        >>> is_upper(lookup(u'10'))
+#        False
+#        """
+#        return self.orth_flags & 1 << IS_UPPER
+#
+#    cpdef bint is_title(self) except *:
+#        """Give the result of unicode.istitle() for a Lexeme ID.
+#
+#        >>> is_title(lookup(u'Hi'))
+#        True
+#        >>> is_title(lookup(u'Hi1'))
+#        True
+#        >>> is_title(lookup(u'1'))
+#        False
+#        """
+#        return self.orth_flags & 1 << IS_TITLE
+#
+#    cpdef bint is_ascii(self) except *:
+#        """Give the result of checking whether all characters in the string are ascii.
+#
+#        >>> is_ascii(lookup(u'Hi'))
+#        True
+#        >>> is_ascii(lookup(u' '))
+#        True
+#        >>> is_title(lookup(u'<unicode>'))
+#        False
+#        """
+#        return self.orth_flags & 1 << IS_ASCII
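The cluster attribute in the docstring above appears to be a Brown cluster ID: a bit string in which similarity is shared prefix length, which is presumably why whole-ID comparisons ("scalable" landing near "pineapple") disappoint. An illustrative sketch with entirely made-up IDs; only the prefix relationship matters:

# Made-up Brown-style cluster IDs; similarity = length of shared bit prefix.
def shared_prefix(a, b):
    n = 0
    for ca, cb in zip(a, b):
        if ca != cb:
            break
        n += 1
    return n

apple     = "111101000"
pineapple = "111101001"              # hypothetical: near "apple" in the hierarchy
scalable  = "001101001"              # hypothetical: unrelated word, similar tail

assert shared_prefix(apple, pineapple) == 8   # long shared prefix: similar
assert shared_prefix(apple, scalable) == 0    # no shared prefix: dissimilar
assert pineapple[-6:] == scalable[-6:]        # matching tails mean nothing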