spaCy/spacy/lang.pyx

# cython: profile=True
# cython: embedsignature=True
"""Common classes and utilities across languages.

Provides the main implementation for the spacy tokenizer. Specific languages
subclass the Language class, overriding the tokenization rules as necessary.
Special-case tokenization rules are read from data/<lang>/tokenization.
"""
from __future__ import unicode_literals

from libc.stdlib cimport calloc, free

import json
import random
from os import path

from .util import read_lang_data
from spacy.tokens import Tokens
from spacy.lexeme cimport LexemeC, lexeme_init
from murmurhash.mrmr cimport hash64


cdef class Language:
    """Base class for language-specific tokenizers.

    Subclasses customise tokenization by overriding _split_one, which is given
    a run of non-whitespace characters and returns how many leading characters
    form the next token. _split_one is driven by _tokenize, which sits behind
    a cache and turns each piece into a LexemeC pointer via the Lexicon. Most
    languages will not need to override _tokenize or tokenize.

    The language is supplied a list of string-feature functions and a list of
    boolean flag-feature functions. These are passed to the language's Lexicon
    object.

    The language's name is handed to read_lang_data to load its data files
    (special-case rules, words, probabilities, clusters, case and tag stats).
    """
    def __cinit__(self, name, string_features, flag_features):
        # None is accepted as shorthand for "no features".
        if flag_features is None:
            flag_features = []
        if string_features is None:
            string_features = []
        self.name = name
        # NOTE(review): cache/specials look like dense_hash_map-style
        # containers, which need an empty key before first use -- confirm
        # against the .pxd declaration.
        self.cache.set_empty_key(0)
        self.specials.set_empty_key(0)
        lang_data = read_lang_data(name)
        rules, words, probs, clusters, case_stats, tag_stats = lang_data
        self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats,
                               string_features, flag_features)
        # Special cases are registered before anything is tokenized so that
        # they always take priority over the generic splitting rules.
        self._load_special_tokenization(rules)
        self.tokens_class = Tokens

    def __dealloc__(self):
        cdef uint64_t hashed
        cdef size_t lex_addr
        # Release the pointer arrays allocated by _load_special_tokenization.
        # NOTE(review): arrays allocated by _tokenize are referenced only from
        # self.cache and are not released here.
        for (hashed, lex_addr) in self.specials:
            free(<LexemeC**>lex_addr)

    property nr_types:
        def __get__(self):
            """Return the number of lexical types in the vocabulary"""
            return self.lexicon.size

    cpdef Lexeme lookup(self, unicode string):
        """Retrieve (or create, if not found) a Lexeme for a string, and return it.

        Args:
            string (unicode): The string to be looked up. Must be unicode, not bytes.

        Returns:
            lexeme (Lexeme): A reference to a lexical type.
        """
        return self.lexicon.lookup(string)

    cpdef Tokens tokenize(self, unicode string):
        """Tokenize a string.

        The string is first cut at whitespace characters; every non-empty
        chunk is then passed to _tokenize. The tokenization rules applied to
        a chunk are defined in two places:

        * The special-case table loaded by _load_special_tokenization, which
          handles cases like contractions;
        * The _split_one method, which is used to split off punctuation etc.

        Args:
            string (unicode): The string to be tokenized.

        Returns:
            tokens (Tokens): A Tokens object holding the lexemes of the string
                in order. It is empty for an empty string.
        """
        cdef size_t length = len(string)
        cdef Tokens tokens = self.tokens_class(length)
        if length == 0:
            return tokens

        cdef size_t start = 0
        cdef size_t i = 0
        cdef Py_UNICODE* chars = string
        cdef String span
        for i in range(length):
            if Py_UNICODE_ISSPACE(chars[i]) == 1:
                # Whitespace closes the current chunk; consecutive whitespace
                # produces empty chunks, which are skipped.
                if start < i:
                    string_from_slice(&span, chars, start, i)
                    self._tokenize(tokens, &span)
                start = i + 1
        # Flush the trailing chunk when the string does not end in whitespace.
        if start < length:
            string_from_slice(&span, chars, start, length)
            self._tokenize(tokens, &span)
        return tokens

    cdef int _tokenize(self, Tokens tokens, String* string):
        """Tokenize one whitespace-free chunk, appending its lexemes to tokens.

        On a cache hit the stored NULL-terminated LexemeC* array is replayed.
        On a miss the chunk is split with _split_one, each piece is resolved
        through the specials table or the lexicon, and the resulting lexemes
        are stored in the cache under the hash of the whole chunk.

        The String pointed to by `string` is consumed (advanced to its end).
        Always returns 0.
        """
        cdef LexemeC** lexemes = <LexemeC**>self.cache[string.key]
        cdef size_t i
        if lexemes != NULL:
            i = 0
            while lexemes[i] != NULL:
                tokens.push_back(lexemes[i])
                i += 1
            return 0
        # string_slice_prefix re-hashes `string` as it is consumed, so keep
        # the key of the complete chunk for the cache entry.
        cdef uint64_t hashed = string.key

        cdef size_t first_token = tokens.length
        cdef int split
        cdef int remaining = string.n
        cdef String prefix
        while remaining >= 1:
            split = self._split_one(string.chars, string.n)
            remaining -= split
            string_slice_prefix(string, &prefix, split)
            lexemes = <LexemeC**>self.specials[prefix.key]
            if lexemes != NULL:
                # A special case may expand one piece into several lexemes.
                i = 0
                while lexemes[i] != NULL:
                    tokens.push_back(lexemes[i])
                    i += 1
            else:
                tokens.push_back(<LexemeC*>self.lexicon.get(&prefix))
        cdef size_t n_new = tokens.length - first_token
        # One extra zeroed slot: readers of the cache stop at the first NULL,
        # so the array must be NULL-terminated.
        lexemes = <LexemeC**>calloc(n_new + 1, sizeof(LexemeC*))
        if lexemes == NULL:
            # The cache is only an optimisation; the tokens themselves have
            # already been appended, so skip caching if allocation fails.
            return 0
        for i in range(n_new):
            lexemes[i] = tokens.lexemes[first_token + i]
        self.cache[hashed] = <size_t>lexemes
        return 0

    cdef int _split_one(self, Py_UNICODE* characters, size_t length):
        """Return the length of the first token in `characters`.

        The base implementation never splits: it returns `length`, so the
        whole chunk becomes a single token.
        """
        return length

    def _load_special_tokenization(self, token_rules):
        '''Load special-case tokenization rules.

        Each rule is stored, as a NULL-terminated array of LexemeC pointers,
        in both Language.specials and Language.cache under the hash of the
        chunk. The special cases are loaded before any language data is
        tokenized, giving these priority.  For instance, the English
        tokenization rules map "ain't" to ["are", "not"].

        Args:
            token_rules (list): A list of (chunk, tokens) pairs, where chunk is
                a string and tokens is a list of strings.

        Raises:
            MemoryError: If the array for a rule cannot be allocated.
        '''
        cdef LexemeC** lexemes
        cdef String string
        cdef size_t i
        cdef size_t n_substrings
        for uni_string, substrings in token_rules:
            n_substrings = len(substrings)
            lexemes = <LexemeC**>calloc(n_substrings + 1, sizeof(LexemeC*))
            if lexemes == NULL:
                raise MemoryError()
            for i, substring in enumerate(substrings):
                string_from_unicode(&string, substring)
                lexemes[i] = <LexemeC*>self.lexicon.get(&string)
            # Terminate by count rather than by the loop variable, which is
            # not (re)assigned when a rule has no substrings.
            lexemes[n_substrings] = NULL
            string_from_unicode(&string, uni_string)
            self.specials[string.key] = <size_t>lexemes
            self.cache[string.key] = <size_t>lexemes


cdef class Lexicon:
    """Vocabulary of lexical types, stored as LexemeC structs.

    Entries live in self._dict, keyed by the 64-bit hash of the word's
    characters (String.key); the values are the addresses of the structs
    returned by lexeme_init.
    """
    def __cinit__(self, words, probs, clusters, case_stats, tag_stats,
                  string_features, flag_features):
        """Build the vocabulary from pre-computed language data.

        Args:
            words: Iterable of unicode strings to pre-load.
            probs, clusters, case_stats, tag_stats: Mappings from word to its
                statistics, read with .get(); missing words get 0.0, 0, {}
                and {} respectively.
            string_features: Callables invoked as
                f(string, prob, cluster, cases, tags); their results are
                collected into the lexeme's views.
            flag_features: Callables with the same signature; the index of
                each one that returns a true value is added to the lexeme's
                flags.
        """
        self._flag_features = flag_features
        self._string_features = string_features
        self._dict.set_empty_key(0)
        self.size = 0
        cdef String hashed_word
        cdef LexemeC* lexeme
        for string in words:
            prob = probs.get(string, 0.0)
            # Integer default, matching the value get() uses for unseen words.
            cluster = clusters.get(string, 0)
            cases = case_stats.get(string, {})
            tags = tag_stats.get(string, {})
            views = [string_view(string, prob, cluster, cases, tags)
                     for string_view in self._string_features]
            flags = set()
            for i, flag_feature in enumerate(self._flag_features):
                if flag_feature(string, prob, cluster, cases, tags):
                    flags.add(i)
            lexeme = lexeme_init(string, prob, cluster, views, flags)
            # Store under the same hash key that get() looks up, so that
            # pre-loaded words are actually found later.
            string_from_unicode(&hashed_word, string)
            self._dict[hashed_word.key] = <size_t>lexeme
            self.size += 1

    cdef size_t get(self, String* string):
        """Return the address of the LexemeC for `string`, creating it if needed.

        Unseen strings are initialised with probability 0, cluster 0 and empty
        case/tag statistics, and are added to the vocabulary.
        """
        cdef LexemeC* lexeme = <LexemeC*>self._dict[string.key]
        if lexeme != NULL:
            return <size_t>lexeme

        cdef unicode uni_string = string.chars[:string.n]
        views = [string_view(uni_string, 0.0, 0, {}, {})
                 for string_view in self._string_features]
        flags = set()
        for i, flag_feature in enumerate(self._flag_features):
            # Same five-argument call as in __cinit__:
            # (string, prob, cluster, cases, tags).
            if flag_feature(uni_string, 0.0, 0, {}, {}):
                flags.add(i)

        lexeme = lexeme_init(uni_string, 0, 0, views, flags)
        self._dict[string.key] = <size_t>lexeme
        self.size += 1
        return <size_t>lexeme

    cpdef Lexeme lookup(self, unicode uni_string):
        """Retrieve (or create, if not found) a Lexeme for a string, and return it.

        Args:
            uni_string (unicode): The string to be looked up. Must be unicode,
                not bytes.

        Returns:
            lexeme (Lexeme): A reference to a lexical type.
        """
        cdef String string
        string_from_unicode(&string, uni_string)
        cdef size_t lexeme = self.get(&string)
        return Lexeme(lexeme)


cdef void string_from_unicode(String* s, unicode uni):
    # Point `s` at the complete character buffer of `uni` and hash it.
    # `s` borrows the buffer, so `uni` has to stay alive while `s` is in use.
    cdef size_t n_chars = len(uni)
    string_from_slice(s, <Py_UNICODE*>uni, 0, n_chars)


cdef inline void string_from_slice(String* s, Py_UNICODE* chars, size_t start, size_t end) nogil:
    # Make `s` a view of chars[start:end] (no copy) and store the 64-bit hash
    # of those characters, computed over their raw bytes with seed 0.
    s.n = end - start
    s.chars = chars + start
    s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)


cdef inline void string_slice_prefix(String* s, String* prefix, size_t n) nogil:
    # Split off the first `n` characters of `s` into `prefix`, then advance
    # `s` past them and re-hash what is left.
    string_from_slice(prefix, s.chars, 0, n)
    # Re-anchoring `s` at offset n of its own buffer moves the pointer,
    # shortens the length by n and recomputes the key in one step.
    string_from_slice(s, s.chars, n, s.n)
* Fixed major efficiency problem, from not quite grokking pass by reference in cython c++ 2014-07-07 09:36:43 +04:00			`# cython: profile=True`
* Broken version being refactored for docs 2014-08-20 15:39:39 +04:00			`# cython: embedsignature=True`
			`"""Common classes and utilities across languages.`

			`Provides the main implementation for the spacy tokenizer. Specific languages`
			`subclass the Language class, over-writing the tokenization rules as necessary.`
			`Special-case tokenization rules are read from data/<lang>/tokenization .`
			`"""`
* Initial commit. Tests passing for punctuation handling. Need contractions, file transport, tokenize function, etc. 2014-07-05 22:51:42 +04:00			`from __future__ import unicode_literals`
* Reorganized, moving language-independent stuff to spacy. The functions in spacy ask for the dictionaries and split function on input, but the language-specific modules are curried versions that use the globals 2014-07-07 06:21:06 +04:00
* Switch to dynamically allocating array, based on the document length 2014-07-07 10:05:29 +04:00			`from libc.stdlib cimport calloc, free`

* Refactoring with Lexeme as a class now compiles. Basic design seems to work 2014-08-27 19:15:39 +04:00			`import json`
* Improve cache mechanism by including a random element depending on the size of the cache. 2014-09-12 02:18:31 +04:00			`import random`
* Fixed major efficiency problem, from not quite grokking pass by reference in cython c++ 2014-07-07 09:36:43 +04:00			`from os import path`
* Progress to getting WordTree working. Tests pass, but so far it's slower. 2014-08-16 21:59:38 +04:00
* Redesign proceeding 2014-08-28 21:45:09 +04:00			`from .util import read_lang_data`
* Refactoring to use Tokens object 2014-09-10 20:11:13 +04:00			`from spacy.tokens import Tokens`
* Only store LexemeC structs in the vocabulary, transforming them to Lexeme objects for output. Moving away from Lexeme objects for Tokens soon. 2014-09-11 14:28:38 +04:00			`from spacy.lexeme cimport LexemeC, lexeme_init`
* Switch to 64 bit hashes, for better reliability 2014-09-12 04:04:47 +04:00			`from murmurhash.mrmr cimport hash64`
* Redesign proceeding 2014-08-28 21:45:09 +04:00
* Refactor for string view features. Working on setting up flags and enums. 2014-07-07 18:58:48 +04:00
* Reorganized code to accomodate Tokens class. Need string views before group_by and count_by can be done well. 2014-07-07 14:47:21 +04:00			`cdef class Language:`
* Redesign proceeding 2014-08-28 21:45:09 +04:00			`"""Base class for language-specific tokenizers.`

			`Most subclasses will override the _split or _split_one methods, which take`
			`a string of non-whitespace characters and output a list of strings. This`
			`function is called by _tokenize, which sits behind a cache and turns the`
			`list of strings into Lexeme objects via the Lexicon. Most languages will not`
			`need to override _tokenize or tokenize.`

			`The language is supplied a list of boolean functions, used to compute flag`
			`features. These are passed to the language's Lexicon object.`

			`The language's name is used to look up default data-files, found in data/<name.`
			`"""`
* Begin testing more functionality 2014-08-30 21:01:15 +04:00			`def __cinit__(self, name, string_features, flag_features):`
* Redesign proceeding 2014-08-28 21:45:09 +04:00			`if flag_features is None:`
			`flag_features = []`
			`if string_features is None:`
			`string_features = []`
* Reorganized code to accomodate Tokens class. Need string views before group_by and count_by can be done well. 2014-07-07 14:47:21 +04:00			`self.name = name`
* Fix performance issues by implementing a better cache. Add own String struct to help 2014-09-13 01:50:37 +04:00			`self.cache.set_empty_key(0)`
			`self.specials.set_empty_key(0)`
* Redesign proceeding 2014-08-28 21:45:09 +04:00			`lang_data = read_lang_data(name)`
			`rules, words, probs, clusters, case_stats, tag_stats = lang_data`
			`self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats,`
			`string_features, flag_features)`
* Docs coming together 2014-08-29 03:59:23 +04:00			`self._load_special_tokenization(rules)`
* Refactor to use tokens class. 2014-09-10 20:27:44 +04:00			`self.tokens_class = Tokens`
* Refactoring to use Tokens object 2014-09-10 20:11:13 +04:00
* Fix performance issues by implementing a better cache. Add own String struct to help 2014-09-13 01:50:37 +04:00			`def __dealloc__(self):`
			`cdef uint64_t hashed`
			`cdef size_t lex_addr`
			`for (hashed, lex_addr) in self.specials:`
			`free(<LexemeC*>lex_addr)`

* Refactoring to use Tokens object 2014-09-10 20:11:13 +04:00			`property nr_types:`
			`def __get__(self):`
			`"""Return the number of lexical types in the vocabulary"""`
			`return self.lexicon.size`

			`cpdef Lexeme lookup(self, unicode string):`
			`"""Retrieve (or create, if not found) a Lexeme for a string, and return it.`

			`Args:`
			`string (unicode): The string to be looked up. Must be unicode, not bytes.`

			`Returns:`
			`lexeme (Lexeme): A reference to a lexical type.`
			`"""`
			`return self.lexicon.lookup(string)`
* Broken version being refactored for docs 2014-08-20 15:39:39 +04:00
* Switch to returning a Tokens object 2014-09-11 23:37:32 +04:00			`cpdef Tokens tokenize(self, unicode string):`
* Refactoring with Lexeme as a class now compiles. Basic design seems to work 2014-08-27 19:15:39 +04:00			`"""Tokenize a string.`

			`The tokenization rules are defined in two places:`
* Broken version being refactored for docs 2014-08-20 15:39:39 +04:00
* Refactoring with Lexeme as a class now compiles. Basic design seems to work 2014-08-27 19:15:39 +04:00			`* The data/<lang>/tokenization table, which handles special cases like contractions;`
			* The appropriate :py:meth:`find_split` function, which is used to split
			`off punctuation etc.`
* Reorganized code to accomodate Tokens class. Need string views before group_by and count_by can be done well. 2014-07-07 14:47:21 +04:00
* Broken version being refactored for docs 2014-08-20 15:39:39 +04:00			`Args:`
* Refactoring with Lexeme as a class now compiles. Basic design seems to work 2014-08-27 19:15:39 +04:00			`string (unicode): The string to be tokenized.`
* Broken version being refactored for docs 2014-08-20 15:39:39 +04:00
			`Returns:`
* Refactoring with Lexeme as a class now compiles. Basic design seems to work 2014-08-27 19:15:39 +04:00			`tokens (Tokens): A Tokens object, giving access to a sequence of LexIDs.`
* Broken version being refactored for docs 2014-08-20 15:39:39 +04:00			`"""`
* Working version that uses arrays for chunks, which should be more memory efficient 2014-08-18 22:23:54 +04:00			`cdef size_t length = len(string)`
* Switch to returning a Tokens object 2014-09-11 23:37:32 +04:00			`cdef Tokens tokens = self.tokens_class(length)`
* Only store LexemeC structs in the vocabulary, transforming them to Lexeme objects for output. Moving away from Lexeme objects for Tokens soon. 2014-09-11 14:28:38 +04:00			`if length == 0:`
* Switch to returning a Tokens object 2014-09-11 23:37:32 +04:00			`return tokens`
* Only store LexemeC structs in the vocabulary, transforming them to Lexeme objects for output. Moving away from Lexeme objects for Tokens soon. 2014-09-11 14:28:38 +04:00
* Working version, adding improvements 2014-08-18 21:59:59 +04:00			`cdef size_t start = 0`
* Roll back to using unicode, and never Py_UNICODE. No dependence on murmurhash either. 2014-08-18 22:48:48 +04:00			`cdef size_t i = 0`
* Fix performance issues by implementing a better cache. Add own String struct to help 2014-09-13 01:50:37 +04:00			`cdef Py_UNICODE* chars = string`
* Switch to 64 bit hashes, for better reliability 2014-09-12 04:04:47 +04:00			`cdef Py_UNICODE c`
* Fix performance issues by implementing a better cache. Add own String struct to help 2014-09-13 01:50:37 +04:00			`cdef String span`
* Switch to 64 bit hashes, for better reliability 2014-09-12 04:04:47 +04:00			`for i in range(length):`
* Fix performance issues by implementing a better cache. Add own String struct to help 2014-09-13 01:50:37 +04:00			`c = chars[i]`
* Fix bug with trailing punct on contractions. Reduced efficiency, and slightly hacky implementation. 2014-09-12 20:00:42 +04:00			`if Py_UNICODE_ISSPACE(c) == 1:`
* Working version, adding improvements 2014-08-18 21:59:59 +04:00			`if start < i:`
* Fix performance issues by implementing a better cache. Add own String struct to help 2014-09-13 01:50:37 +04:00			`string_from_slice(&span, chars, start, i)`
			`self._tokenize(tokens, &span)`
* Working version, adding improvements 2014-08-18 21:59:59 +04:00			`start = i + 1`
* Switch to 64 bit hashes, for better reliability 2014-09-12 04:04:47 +04:00			`i += 1`
* Working version, adding improvements 2014-08-18 21:59:59 +04:00			`if start < i:`
* Fix performance issues by implementing a better cache. Add own String struct to help 2014-09-13 01:50:37 +04:00			`string_from_slice(&span, chars, start, i)`
			`self._tokenize(tokens, &span)`
* Switch to returning a Tokens object 2014-09-11 23:37:32 +04:00			`return tokens`
* Reorganized code to accomodate Tokens class. Need string views before group_by and count_by can be done well. 2014-07-07 14:47:21 +04:00
* Efficiency tweaks 2014-09-13 02:14:05 +04:00			`cdef int _tokenize(self, Tokens tokens, String* string):`
* More performance fiddling, particularly moving the specials into the cache, so that we can just lookup the cache in _tokenize 2014-09-13 02:59:34 +04:00			`cdef LexemeC lexemes = <LexemeC>self.cache[string.key]`
			`lexemes = <LexemeC**>self.cache[string.key]`
			`cdef size_t i`
* Fix performance issues by implementing a better cache. Add own String struct to help 2014-09-13 01:50:37 +04:00			`if lexemes != NULL:`
* More performance fiddling, particularly moving the specials into the cache, so that we can just lookup the cache in _tokenize 2014-09-13 02:59:34 +04:00			`i = 0`
			`while lexemes[i] != NULL:`
			`tokens.push_back(lexemes[i])`
			`i += 1`
* Tweak signatures and refactor slightly. Processing gigaword taking 8-9 mins. Tests passing, but some sort of memory bug on exit. 2014-09-12 04:43:36 +04:00			`return 0`
* Fix performance issues by implementing a better cache. Add own String struct to help 2014-09-13 01:50:37 +04:00			`cdef uint64_t hashed = string.key`
* Fix bug with trailing punct on contractions. Reduced efficiency, and slightly hacky implementation. 2014-09-12 20:00:42 +04:00
* Fix performance issues by implementing a better cache. Add own String struct to help 2014-09-13 01:50:37 +04:00			`cdef size_t first_token = tokens.length`
* Fiddle with declarations, for small efficiency boost 2014-09-13 02:31:53 +04:00			`cdef int split`
* Efficiency tweaks 2014-09-13 02:14:05 +04:00			`cdef int remaining = string.n`
* Fix performance issues by implementing a better cache. Add own String struct to help 2014-09-13 01:50:37 +04:00			`cdef String prefix`
* Efficiency tweaks 2014-09-13 02:14:05 +04:00			`while remaining >= 1:`
* Fix performance issues by implementing a better cache. Add own String struct to help 2014-09-13 01:50:37 +04:00			`split = self._split_one(string.chars, string.n)`
* Efficiency tweaks 2014-09-13 02:14:05 +04:00			`remaining -= split`
* Fix performance issues by implementing a better cache. Add own String struct to help 2014-09-13 01:50:37 +04:00			`string_slice_prefix(string, &prefix, split)`
			`lexemes = <LexemeC**>self.specials[prefix.key]`
			`if lexemes != NULL:`
* More performance fiddling, particularly moving the specials into the cache, so that we can just lookup the cache in _tokenize 2014-09-13 02:59:34 +04:00			`i = 0`
			`while lexemes[i] != NULL:`
			`tokens.push_back(lexemes[i])`
			`i += 1`
* Changed cache to use a linked-list data structure, to take out Python list code. Taking 6-7 mins for gigaword. 2014-09-12 05:30:50 +04:00			`else:`
* Fix performance issues by implementing a better cache. Add own String struct to help 2014-09-13 01:50:37 +04:00			`tokens.push_back(<LexemeC*>self.lexicon.get(&prefix))`
			`lexemes = <LexemeC*>calloc(tokens.length - first_token, sizeof(LexemeC))`
* Efficiency tweaks 2014-09-13 02:14:05 +04:00			`cdef size_t j`
* Fix performance issues by implementing a better cache. Add own String struct to help 2014-09-13 01:50:37 +04:00			`for i, j in enumerate(range(first_token, tokens.length)):`
			`lexemes[i] = tokens.lexemes[j]`
			`self.cache[hashed] = <size_t>lexemes`
* Refactoring with Lexeme as a class now compiles. Basic design seems to work 2014-08-27 19:15:39 +04:00
* Replace main lexicon dict with dense_hash_map. May be unsuitable, if strings need recovery. 2014-09-12 06:29:09 +04:00			`cdef int _split_one(self, Py_UNICODE* characters, size_t length):`
			`return length`
* Refactoring tokenizer 2014-08-16 05:22:03 +04:00
* Docs coming together 2014-08-29 03:59:23 +04:00			`def _load_special_tokenization(self, token_rules):`
* Broken version being refactored for docs 2014-08-20 15:39:39 +04:00			`'''Load special-case tokenization rules.`

* Refactoring with Lexeme as a class now compiles. Basic design seems to work 2014-08-27 19:15:39 +04:00			`Loads special-case tokenization rules into the Language.cache cache,`
* Broken version being refactored for docs 2014-08-20 15:39:39 +04:00			`read from data/<lang>/tokenization . The special cases are loaded before`
			`any language data is tokenized, giving these priority. For instance,`
			`the English tokenization rules map "ain't" to ["are", "not"].`

			`Args:`
			`token_rules (list): A list of (chunk, tokens) pairs, where chunk is`
			`a string and tokens is a list of strings.`
			`'''`
* Fix performance issues by implementing a better cache. Add own String struct to help 2014-09-13 01:50:37 +04:00			`cdef LexemeC** lexemes`
* Switch to 64 bit hashes, for better reliability 2014-09-12 04:04:47 +04:00			`cdef uint64_t hashed`
* Fix performance issues by implementing a better cache. Add own String struct to help 2014-09-13 01:50:37 +04:00			`cdef String string`
			`for uni_string, substrings in token_rules:`
			`lexemes = <LexemeC*>calloc(len(substrings) + 1, sizeof(LexemeC))`
			`for i, substring in enumerate(substrings):`
			`string_from_unicode(&string, substring)`
			`lexemes[i] = <LexemeC*>self.lexicon.get(&string)`
			`lexemes[i + 1] = NULL`
			`string_from_unicode(&string, uni_string)`
			`self.specials[string.key] = <size_t>lexemes`
* More performance fiddling, particularly moving the specials into the cache, so that we can just lookup the cache in _tokenize 2014-09-13 02:59:34 +04:00			`self.cache[string.key] = <size_t>lexemes`
* Fix performance issues by implementing a better cache. Add own String struct to help 2014-09-13 01:50:37 +04:00
* Working version, adding improvements 2014-08-18 21:59:59 +04:00
* Refactoring with Lexeme as a class now compiles. Basic design seems to work 2014-08-27 19:15:39 +04:00			`cdef class Lexicon:`
* Redesign proceeding 2014-08-28 21:45:09 +04:00			`def __cinit__(self, words, probs, clusters, case_stats, tag_stats,`
			`string_features, flag_features):`
* Docs coming together 2014-08-29 03:59:23 +04:00			`self._flag_features = flag_features`
			`self._string_features = string_features`
* Replace main lexicon dict with dense_hash_map. May be unsuitable, if strings need recovery. 2014-09-12 06:29:09 +04:00			`self._dict.set_empty_key(0)`
* Refactoring to use Tokens object 2014-09-10 20:11:13 +04:00			`self.size = 0`
* Redesign proceeding 2014-08-28 21:45:09 +04:00			`cdef Lexeme word`
			`for string in words:`
* Only store LexemeC structs in the vocabulary, transforming them to Lexeme objects for output. Moving away from Lexeme objects for Tokens soon. 2014-09-11 14:28:38 +04:00			`prob = probs.get(string, 0.0)`
			`cluster = clusters.get(string, 0.0)`
			`cases = case_stats.get(string, {})`
			`tags = tag_stats.get(string, {})`
			`views = [string_view(string, prob, cluster, cases, tags)`
			`for string_view in self._string_features]`
			`flags = set()`
			`for i, flag_feature in enumerate(self._flag_features):`
			`if flag_feature(string, prob, cluster, cases, tags):`
			`flags.add(i)`
			`lexeme = lexeme_init(string, prob, cluster, views, flags)`
			`self._dict[string] = <size_t>lexeme`
* Refactoring to use Tokens object 2014-09-10 20:11:13 +04:00			`self.size += 1`
* Broken version being refactored for docs 2014-08-20 15:39:39 +04:00
* Fix performance issues by implementing a better cache. Add own String struct to help 2014-09-13 01:50:37 +04:00			`cdef size_t get(self, String* string):`
			`cdef LexemeC* lexeme = <LexemeC*>self._dict[string.key]`
* Replace main lexicon dict with dense_hash_map. May be unsuitable, if strings need recovery. 2014-09-12 06:29:09 +04:00			`if lexeme != NULL:`
			`return <size_t>lexeme`
* Refactoring with Lexeme as a class now compiles. Basic design seems to work 2014-08-27 19:15:39 +04:00
* Fix performance issues by implementing a better cache. Add own String struct to help 2014-09-13 01:50:37 +04:00			`cdef unicode uni_string = string.chars[:string.n]`
			`views = [string_view(uni_string, 0.0, 0, {}, {})`
* Only store LexemeC structs in the vocabulary, transforming them to Lexeme objects for output. Moving away from Lexeme objects for Tokens soon. 2014-09-11 14:28:38 +04:00			`for string_view in self._string_features]`
			`flags = set()`
			`for i, flag_feature in enumerate(self._flag_features):`
* Fix performance issues by implementing a better cache. Add own String struct to help 2014-09-13 01:50:37 +04:00			`if flag_feature(uni_string, 0.0, {}, {}):`
* Only store LexemeC structs in the vocabulary, transforming them to Lexeme objects for output. Moving away from Lexeme objects for Tokens soon. 2014-09-11 14:28:38 +04:00			`flags.add(i)`

* Fix performance issues by implementing a better cache. Add own String struct to help 2014-09-13 01:50:37 +04:00			`lexeme = lexeme_init(uni_string, 0, 0, views, flags)`
			`self._dict[string.key] = <size_t>lexeme`
* Refactoring to use Tokens object 2014-09-10 20:11:13 +04:00			`self.size += 1`
* Moving to storing LexemeC structs internally 2014-09-11 23:54:34 +04:00			`return <size_t>lexeme`

* Fix performance issues by implementing a better cache. Add own String struct to help 2014-09-13 01:50:37 +04:00			`cpdef Lexeme lookup(self, unicode uni_string):`
* Moving to storing LexemeC structs internally 2014-09-11 23:54:34 +04:00			`"""Retrieve (or create, if not found) a Lexeme for a string, and return it.`

			`Args`
			`string (unicode): The string to be looked up. Must be unicode, not bytes.`

			`Returns:`
			`lexeme (Lexeme): A reference to a lexical type.`
			`"""`
* Fix performance issues by implementing a better cache. Add own String struct to help 2014-09-13 01:50:37 +04:00			`cdef String string`
			`string_from_unicode(&string, uni_string)`
			`cdef size_t lexeme = self.get(&string)`
* Moving to storing LexemeC structs internally 2014-09-11 23:54:34 +04:00			`return Lexeme(lexeme)`
* Fix performance issues by implementing a better cache. Add own String struct to help 2014-09-13 01:50:37 +04:00

* Efficiency tweaks 2014-09-13 02:14:05 +04:00			`cdef void string_from_unicode(String* s, unicode uni):`
* Fix performance issues by implementing a better cache. Add own String struct to help 2014-09-13 01:50:37 +04:00			`string_from_slice(s, <Py_UNICODE*>uni, 0, len(uni))`


* Fiddle with declarations, for small efficiency boost 2014-09-13 02:31:53 +04:00			`cdef inline void string_from_slice(String* s, Py_UNICODE* chars, size_t start, size_t end) nogil:`
* Fix performance issues by implementing a better cache. Add own String struct to help 2014-09-13 01:50:37 +04:00			`s.chars = &chars[start]`
			`s.n = end - start`
			`s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)`


* Fiddle with declarations, for small efficiency boost 2014-09-13 02:31:53 +04:00			`cdef inline void string_slice_prefix(String* s, String* prefix, size_t n) nogil:`
* Fix performance issues by implementing a better cache. Add own String struct to help 2014-09-13 01:50:37 +04:00			`string_from_slice(prefix, s.chars, 0, n)`
			`s.chars += n`
			`s.n -= n`
			`s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)`