spaCy/spacy/tokenizer.pyx

# cython: embedsignature=True
from __future__ import unicode_literals

from os import path
import re

from cython.operator cimport dereference as deref
from cython.operator cimport preincrement as preinc

from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap

from .structs cimport UniStr
from .strings cimport slice_unicode
from .morphology cimport set_morph_from_dict

from . import util
from .util import read_lang_data
from .tokens.doc cimport Doc


cdef class Tokenizer:
    def __init__(self, Vocab vocab, rules, prefix_re, suffix_re, infix_re):
        """Create a tokenizer bound to a vocabulary.

        Args:
            vocab (Vocab): Vocabulary used for lexeme look-ups; its
                `pos_tags` attribute is handed to the special-case loader.
            rules: Special-case table, iterated with `.items()` as
                chunk -> list of per-token property dicts.
            prefix_re: Object whose `.search()` is used to find a prefix.
            suffix_re: Object whose `.search()` is used to find a suffix.
            infix_re: Object whose `.search()` is used to find an infix.
        """
        # The vocabulary, the memory pool and both hash maps have to exist
        # before the special cases are loaded, because loading writes to them.
        self.vocab = vocab
        self.mem = Pool()
        self._specials = PreshMap()
        self._cache = PreshMap()
        self._infix_re = infix_re
        self._suffix_re = suffix_re
        self._prefix_re = prefix_re
        self._load_special_tokenization(rules, self.vocab.pos_tags)

    @classmethod
    def from_dir(cls, Vocab vocab, data_dir):
        """Build a tokenizer from the language data found in `data_dir`.

        Args:
            vocab (Vocab): Vocabulary to attach to the new tokenizer.
            data_dir: Location passed unchanged to `read_lang_data`.

        Returns:
            An instance of `cls` whose prefix, suffix and infix patterns
            have been compiled with `re.compile`.
        """
        rules, prefix_pat, suffix_pat, infix_pat = read_lang_data(data_dir)
        return cls(
            vocab,
            rules,
            re.compile(prefix_pat),
            re.compile(suffix_pat),
            re.compile(infix_pat),
        )

    cpdef Doc tokens_from_list(self, list strings):
        """Build a Doc from strings that are already split into tokens.

        Each string is looked up in the vocabulary and appended as exactly
        one token; the prefix/suffix/infix machinery is not involved.

        Args:
            strings (list): Unicode strings, one per token.

        Returns:
            tokens (Doc): The assembled Doc. It is empty when the combined
                length of all strings is zero (including an empty list).
        """
        cdef Doc tokens = Doc(self.vocab)
        cdef UniStr string_struct
        cdef unicode py_string
        if sum([len(s) for s in strings]) == 0:
            return tokens
        for py_string in strings:
            slice_unicode(&string_struct, py_string, 0, len(py_string))
            # Note that we pass tokens.mem here --- the Doc object has ownership
            tokens.push_back(
                <const LexemeC*>self.vocab.get(tokens.mem, &string_struct), True)
        return tokens

    def __call__(self, unicode string):
        """Tokenize a string.

        The tokenization rules are defined in three places:

        * The data/<lang>/tokenization table, which handles special cases like contractions;
        * The data/<lang>/prefix file, used to build a regex to split off prefixes;
        * The data/<lang>/suffix file, used to build a regex to split off suffixes.

        The string is first split on whitespace.  To tokenize a whitespace-delimited
        chunk, we first try to look it up in the special-cases. If it's not found,
        we split off a prefix, and then try again. If it's still not found, we
        split off a suffix, and repeat. Whatever remains after the affixes are
        removed may additionally be split once around a match of the infix regex.

        Every chunk is first looked up in the cache by its hash key; only on a
        miss is the full affix-splitting procedure run.

        Args:
            string (unicode): The string to be tokenized.

        Returns:
            tokens (Doc): A Doc object, giving access to a sequence of LexemeCs.
                An empty input string yields an empty Doc.
        """
        cdef int length = len(string)
        cdef Doc tokens = Doc(self.vocab)
        if length == 0:
            return tokens
        cdef int i = 0
        cdef int start = 0
        cdef bint cache_hit
        cdef Py_UNICODE* chars = string
        # The input is treated as alternating runs of whitespace and
        # non-whitespace; in_ws records which kind the current run is.
        cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0])
        cdef UniStr span
        for i in range(1, length):
            if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
                # The run chars[start:i] has just ended. It can be empty when
                # the previous run was a single space that was skipped below.
                if start < i:
                    slice_unicode(&span, chars, start, i)
                    cache_hit = self._try_cache(span.key, tokens)
                    if not cache_hit:
                        self._tokenize(tokens, &span, start, i)
                in_ws = not in_ws
                start = i
                if chars[i] == ' ':
                    # The new run opens with a plain space: set the `spacy`
                    # flag on the token just emitted and step over that one
                    # space, so a lone separating space does not become a
                    # token of its own. Longer or non-' ' whitespace runs are
                    # still sliced and emitted from `start`.
                    tokens.data[tokens.length - 1].spacy = True
                    start += 1
        # Advance i to one past the last character so the slice below covers
        # the final run (for a one-character string the loop body never runs
        # and i goes from 0 to 1).
        i += 1
        if start < i:
            slice_unicode(&span, chars, start, i)
            cache_hit = self._try_cache(span.key, tokens)
            if not cache_hit:
                self._tokenize(tokens, &span, start, i)

            # NOTE(review): when the final run is itself whitespace (e.g. the
            # input ends in two spaces) this sets `spacy` on that whitespace
            # token -- confirm this is intended.
            tokens.data[tokens.length - 1].spacy = string[-1] == ' '
        return tokens

    cdef int _try_cache(self, hash_t key, Doc tokens) except -1:
        """Replay a cached analysis for `key` onto `tokens`.

        Returns True when the cache holds an entry for `key` (all of its
        items are appended to `tokens`), and False when it does not, in
        which case `tokens` is left untouched.
        """
        cdef int idx
        entry = <_Cached*>self._cache.get(key)
        if entry == NULL:
            return False
        for idx in range(entry.length):
            # An entry holds either an array of lexeme pointers or an array
            # of complete token structs; is_lex says which one is populated.
            if entry.is_lex:
                tokens.push_back(entry.data.lexemes[idx], False)
            else:
                tokens.push_back(&entry.data.tokens[idx], False)
        return True

    cdef int _tokenize(self, Doc tokens, UniStr* span, int start, int end) except -1:
        """Tokenize one uncached chunk, append it to `tokens`, and cache it.

        `span` is rewritten in place while affixes are stripped. `start` is
        forwarded to `_attach_tokens`; `end` is not used in this method.
        """
        cdef vector[LexemeC*] prefixes
        cdef vector[LexemeC*] suffixes
        # Snapshot these before _split_affixes() modifies `span`: the cache
        # entry must be keyed on the whole chunk and must cover exactly the
        # tokens appended from this point on.
        cdef hash_t chunk_key = span.key
        cdef int first_new = tokens.length
        self._split_affixes(span, &prefixes, &suffixes)
        self._attach_tokens(tokens, start, span, &prefixes, &suffixes)
        self._save_cached(&tokens.data[first_new], chunk_key,
                          tokens.length - first_new)

    cdef UniStr* _split_affixes(self, UniStr* string, vector[const LexemeC*] *prefixes,
                                vector[const LexemeC*] *suffixes) except NULL:
        """Strip prefixes and suffixes off `string` until nothing more comes off.

        `string` is modified in place so that it ends up describing what is
        left in the middle. The lexeme for each removed prefix is appended to
        `prefixes` and for each removed suffix to `suffixes`, both in the
        order they were removed (outermost first). The loop stops when the
        string is empty, when a pass removes nothing, or as soon as the
        remainder is a key in `self._specials`.

        Returns:
            The same `string` pointer that was passed in.
        """
        # NOTE(review): `i` is declared but never used in this method.
        cdef size_t i
        cdef UniStr prefix
        cdef UniStr suffix
        cdef UniStr minus_pre
        cdef UniStr minus_suf
        # last_size holds the length seen at the start of the previous pass;
        # an unchanged length means no affix was removed, which ends the loop.
        cdef size_t last_size = 0
        while string.n != 0 and string.n != last_size:
            last_size = string.n
            pre_len = self._find_prefix(string.chars, string.n)
            if pre_len != 0:
                slice_unicode(&prefix, string.chars, 0, pre_len)
                slice_unicode(&minus_pre, string.chars, pre_len, string.n)
                # Check whether we've hit a special-case
                if minus_pre.n >= 1 and self._specials.get(minus_pre.key) != NULL:
                    string[0] = minus_pre
                    prefixes.push_back(self.vocab.get(self.vocab.mem, &prefix))
                    break
            # The suffix is searched for in the string as it stood at the top
            # of this pass, i.e. with the prefix found above still attached.
            suf_len = self._find_suffix(string.chars, string.n)
            if suf_len != 0:
                slice_unicode(&suffix, string.chars, string.n - suf_len, string.n)
                slice_unicode(&minus_suf, string.chars, 0, string.n - suf_len)
                # Check whether we've hit a special-case
                if minus_suf.n >= 1 and self._specials.get(minus_suf.key) != NULL:
                    string[0] = minus_suf
                    suffixes.push_back(self.vocab.get(self.vocab.mem, &suffix))
                    break
            # Remove both affixes in one step when they do not overlap;
            # otherwise remove only the prefix if there is one, else only
            # the suffix.
            if pre_len and suf_len and (pre_len + suf_len) <= string.n:
                slice_unicode(string, string.chars, pre_len, string.n - suf_len)
                prefixes.push_back(self.vocab.get(self.vocab.mem, &prefix))
                suffixes.push_back(self.vocab.get(self.vocab.mem, &suffix))
            elif pre_len:
                string[0] = minus_pre
                prefixes.push_back(self.vocab.get(self.vocab.mem, &prefix))
            elif suf_len:
                string[0] = minus_suf
                suffixes.push_back(self.vocab.get(self.vocab.mem, &suffix))
            # The remainder is itself a special case: stop stripping.
            # NOTE(review): the two checks above compare against NULL
            # explicitly; this one relies on pointer truthiness.
            if self._specials.get(string.key):
                break
        return string

    cdef int _attach_tokens(self, Doc tokens, int idx, UniStr* string,
                            vector[const LexemeC*] *prefixes,
                            vector[const LexemeC*] *suffixes) except -1:
        """Append one analysed chunk to `tokens`.

        Tokens are pushed in surface order: the collected prefixes, then the
        remaining `string` (taken from the cache if present, otherwise split
        around an infix match or pushed whole), then the collected suffixes
        in reverse order of collection.

        NOTE(review): `idx` and the local `lexemes` are not used in this
        method.
        """
        cdef bint cache_hit
        cdef int split, end
        cdef const LexemeC* const* lexemes
        cdef const LexemeC* lexeme
        cdef UniStr span
        cdef int i
        if prefixes.size():
            for i in range(prefixes.size()):
                tokens.push_back(prefixes[0][i], False)
        if string.n != 0:
            # The remainder may be a special case or a previously cached chunk.
            cache_hit = self._try_cache(string.key, tokens)
            if cache_hit:
                pass
            else:
                match = self._find_infix(string.chars, string.n)
                if match is None:
                    tokens.push_back(self.vocab.get(tokens.mem, string), False)
                else:
                    # NOTE(review): only the first infix match is used, so the
                    # remainder is split into exactly three pieces. If the
                    # match starts at offset 0 or ends at string.n, an empty
                    # slice is looked up and pushed -- confirm the infix
                    # patterns cannot match at the edges.
                    split = match.start()
                    end = match.end()
                    # Append the beginning, afix, end of the infix span
                    slice_unicode(&span, string.chars, 0, split)
                    tokens.push_back(self.vocab.get(tokens.mem, &span), False)
                    
                    slice_unicode(&span, string.chars, split, end)
                    tokens.push_back(self.vocab.get(tokens.mem, &span), False)
                    
                    slice_unicode(&span, string.chars, end, string.n)
                    tokens.push_back(self.vocab.get(tokens.mem, &span), False)
        # Walk the suffixes back to front, reversing the order in which
        # _split_affixes appended them.
        cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin()
        while it != suffixes.rend():
            lexeme = deref(it)
            preinc(it)
            tokens.push_back(lexeme, False)

    cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1:
        """Store the lexemes of `tokens[0:n]` in the cache under `key`.

        Nothing is stored, and 0 is returned, if any of the tokens has a
        lexeme whose `id` is 0 (presumably an out-of-vocabulary lexeme --
        confirm). Otherwise a lexeme-only cache entry is allocated from the
        tokenizer's own pool and registered in `self._cache`.

        Returns:
            0 in all non-error cases.
        """
        cdef int i
        for i in range(n):
            if tokens[i].lex.id == 0:
                return 0
        cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
        cached.length = n
        cached.is_lex = True
        # The array holds n lexeme pointers, so the element size is that of
        # a LexemeC*, not of a LexemeC**.
        lexemes = <const LexemeC**>self.mem.alloc(n, sizeof(LexemeC*))
        for i in range(n):
            lexemes[i] = tokens[i].lex
        cached.data.lexemes = <const LexemeC* const*>lexemes
        self._cache.set(key, cached)
        return 0

    cdef object _find_infix(self, Py_UNICODE* chars, size_t length):
        """Search the first `length` characters of `chars` for an infix.

        Returns whatever `self._infix_re.search()` returns: a match object,
        or None when nothing matches.
        """
        cdef unicode text = chars[:length]
        found = self._infix_re.search(text)
        return found

    cdef int _find_prefix(self, Py_UNICODE* chars, size_t length) except -1:
        """Return the length of the prefix match in `chars[:length]`, or 0.

        The value is the width of the first match found by
        `self._prefix_re.search()`; callers treat it as a character count
        measured from the start of the string.
        """
        cdef unicode text = chars[:length]
        found = self._prefix_re.search(text)
        if found is None:
            return 0
        return found.end() - found.start()

    cdef int _find_suffix(self, Py_UNICODE* chars, size_t length) except -1:
        """Return the length of the suffix match in `chars[:length]`, or 0.

        The value is the width of the first match found by
        `self._suffix_re.search()`; callers treat it as a character count
        measured back from the end of the string.
        """
        cdef unicode text = chars[:length]
        found = self._suffix_re.search(text)
        if found is None:
            return 0
        return found.end() - found.start()

    def _load_special_tokenization(self, object rules, object tag_map):
        '''Load every special-case tokenization rule.

        Args:
            rules: Mapping from a chunk of text to a list of property dicts,
                one dict per token the chunk should be split into. Each dict
                must have the form under 'F'; 'L' (lemma) and 'pos' are
                optional, and the whole dict is also passed to
                set_morph_from_dict.
            tag_map: Indexed by the 'pos' value; element 0 of the result is
                stored as the token's pos and element 1 is passed to
                set_morph_from_dict.

        Each chunk is registered under its hash in both self._specials and
        self._cache, sharing a single entry.
        '''
        cdef int i
        cdef unicode chunk
        cdef list substrings
        cdef unicode form
        cdef unicode lemma
        cdef dict props
        cdef UniStr string
        cdef TokenC* token
        for chunk, substrings in sorted(rules.items()):
            n_tokens = len(substrings)
            specials = <TokenC*>self.mem.alloc(n_tokens + 1, sizeof(TokenC))
            for i, props in enumerate(substrings):
                token = &specials[i]
                form = props['F']
                lemma = props.get("L", None)
                slice_unicode(&string, form, 0, len(form))
                token.lex = <LexemeC*>self.vocab.get(self.vocab.mem, &string)
                token.lemma = self.vocab.strings[lemma] if lemma is not None else 0
                if 'pos' in props:
                    pos_name = props['pos']
                    token.tag = self.vocab.strings[pos_name]
                    tag_entry = tag_map[pos_name]
                    token.pos = tag_entry[0]
                    # These are defaults, which can be over-ridden by the
                    # token-specific props.
                    set_morph_from_dict(&token.morph, tag_entry[1])
                    # Without an explicit lemma, fall back to the surface form.
                    if token.lemma == 0:
                        token.lemma = token.lex.orth
                set_morph_from_dict(&token.morph, props)
            entry = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
            entry.length = n_tokens
            entry.is_lex = False
            entry.data.tokens = specials
            slice_unicode(&string, chunk, 0, len(chunk))
            self._specials.set(string.key, entry)
            self._cache.set(string.key, entry)
* Move lang.pyx to tokenizer.pyx 2014-12-19 23:54:49 +03:00			`# cython: embedsignature=True`
			`from __future__ import unicode_literals`

* Tmp 2014-12-20 21:36:29 +03:00			`from os import path`
* Work on train 2014-12-21 23:25:43 +03:00			`import re`
* Tmp 2014-12-20 21:36:29 +03:00
* Move lang.pyx to tokenizer.pyx 2014-12-19 23:54:49 +03:00			`from cython.operator cimport dereference as deref`
			`from cython.operator cimport preincrement as preinc`

			`from cymem.cymem cimport Pool`
			`from preshed.maps cimport PreshMap`

			`from .structs cimport UniStr`
			`from .strings cimport slice_unicode`
			`from .morphology cimport set_morph_from_dict`

			`from . import util`
			`from .util import read_lang_data`
* Break up tokens.pyx into tokens/doc.pyx, tokens/token.pyx, tokens/spans.pyx 2015-07-13 21:20:58 +03:00			`from .tokens.doc cimport Doc`
* Move lang.pyx to tokenizer.pyx 2014-12-19 23:54:49 +03:00

			`cdef class Tokenizer:`
* Remove redundant tag_names argument to Tokenizer 2015-07-08 13:36:04 +03:00			`def __init__(self, Vocab vocab, rules, prefix_re, suffix_re, infix_re):`
* Move lang.pyx to tokenizer.pyx 2014-12-19 23:54:49 +03:00			`self.mem = Pool()`
			`self._cache = PreshMap()`
			`self._specials = PreshMap()`
			`self._prefix_re = prefix_re`
			`self._suffix_re = suffix_re`
			`self._infix_re = infix_re`
* Work on train 2014-12-21 23:25:43 +03:00			`self.vocab = vocab`
* Remove redundant tag_names argument to Tokenizer 2015-07-08 13:36:04 +03:00			`self._load_special_tokenization(rules, self.vocab.pos_tags)`
* Begin refactor 2015-07-07 15:00:07 +03:00
			`@classmethod`
* Remove redundant tag_names argument to Tokenizer 2015-07-08 13:36:04 +03:00			`def from_dir(cls, Vocab vocab, data_dir):`
* Pass pos_tags into Tokenizer.from_dir 2015-07-07 15:23:08 +03:00			`rules, prefix_re, suffix_re, infix_re = read_lang_data(data_dir)`
* Begin refactor 2015-07-07 15:00:07 +03:00			`prefix_re = re.compile(prefix_re)`
			`suffix_re = re.compile(suffix_re)`
			`infix_re = re.compile(infix_re)`
* Remove redundant tag_names argument to Tokenizer 2015-07-08 13:36:04 +03:00			`return cls(vocab, rules, prefix_re, suffix_re, infix_re)`
* Move lang.pyx to tokenizer.pyx 2014-12-19 23:54:49 +03:00
* Rename Tokens to Doc 2015-07-08 19:53:00 +03:00			`cpdef Doc tokens_from_list(self, list strings):`
* Refactor tokenizer, to set the 'spacy' field on TokenC instead of passing a string 2015-07-13 22:46:02 +03:00			`cdef Doc tokens = Doc(self.vocab)`
			`if sum([len(s) for s in strings]) == 0:`
* Move lang.pyx to tokenizer.pyx 2014-12-19 23:54:49 +03:00			`return tokens`
			`cdef UniStr string_struct`
			`cdef unicode py_string`
			`cdef int idx = 0`
			`for i, py_string in enumerate(strings):`
			`slice_unicode(&string_struct, py_string, 0, len(py_string))`
* Refactor tokenizer, to set the 'spacy' field on TokenC instead of passing a string 2015-07-13 22:46:02 +03:00			`# Note that we pass tokens.mem here --- the Doc object has ownership`
			`tokens.push_back(`
			`<const LexemeC*>self.vocab.get(tokens.mem, &string_struct), True)`
* Move lang.pyx to tokenizer.pyx 2014-12-19 23:54:49 +03:00			`idx += len(py_string) + 1`
			`return tokens`

* Work on word vectors, and other stuff 2015-01-17 08:21:17 +03:00			`def __call__(self, unicode string):`
* Move lang.pyx to tokenizer.pyx 2014-12-19 23:54:49 +03:00			`"""Tokenize a string.`

			`The tokenization rules are defined in three places:`

			`* The data/<lang>/tokenization table, which handles special cases like contractions;`
			`* The data/<lang>/prefix file, used to build a regex to split off prefixes;`
			`* The data/<lang>/suffix file, used to build a regex to split off suffixes.`

			`The string is first split on whitespace. To tokenize a whitespace-delimited`
			`chunk, we first try to look it up in the special-cases. If it's not found,`
			`we split off a prefix, and then try again. If it's still not found, we`
			`split off a suffix, and repeat.`

			`Args:`
Remove trailing whitespace 2015-04-19 11:31:31 +03:00			`string (unicode): The string to be tokenized.`
* Move lang.pyx to tokenizer.pyx 2014-12-19 23:54:49 +03:00
			`Returns:`
* Rename Tokens to Doc 2015-07-08 19:53:00 +03:00			`tokens (Doc): A Doc object, giving access to a sequence of LexemeCs.`
* Move lang.pyx to tokenizer.pyx 2014-12-19 23:54:49 +03:00			`"""`
			`cdef int length = len(string)`
* Refactor tokenizer, to set the 'spacy' field on TokenC instead of passing a string 2015-07-13 22:46:02 +03:00			`cdef Doc tokens = Doc(self.vocab)`
* Move lang.pyx to tokenizer.pyx 2014-12-19 23:54:49 +03:00			`if length == 0:`
			`return tokens`
			`cdef int i = 0`
			`cdef int start = 0`
			`cdef bint cache_hit`
			`cdef Py_UNICODE* chars = string`
			`cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0])`
			`cdef UniStr span`
			`for i in range(1, length):`
* Remove hyphenation from main tokenizer loop: do it in infix.txt instead. This lets emoticons work 2015-06-06 06:57:03 +03:00			`if Py_UNICODE_ISSPACE(chars[i]) != in_ws:`
* Move lang.pyx to tokenizer.pyx 2014-12-19 23:54:49 +03:00			`if start < i:`
			`slice_unicode(&span, chars, start, i)`
* Refactor tokenizer, to set the 'spacy' field on TokenC instead of passing a string 2015-07-13 22:46:02 +03:00			`cache_hit = self._try_cache(span.key, tokens)`
* Move lang.pyx to tokenizer.pyx 2014-12-19 23:54:49 +03:00			`if not cache_hit:`
			`self._tokenize(tokens, &span, start, i)`
			`in_ws = not in_ws`
			`start = i`
			`if chars[i] == ' ':`
* Refactor tokenizer, to set the 'spacy' field on TokenC instead of passing a string 2015-07-13 22:46:02 +03:00			`tokens.data[tokens.length - 1].spacy = True`
* Move lang.pyx to tokenizer.pyx 2014-12-19 23:54:49 +03:00			`start += 1`
			`i += 1`
			`if start < i:`
			`slice_unicode(&span, chars, start, i)`
* Refactor tokenizer, to set the 'spacy' field on TokenC instead of passing a string 2015-07-13 22:46:02 +03:00			`cache_hit = self._try_cache(span.key, tokens)`
* Move lang.pyx to tokenizer.pyx 2014-12-19 23:54:49 +03:00			`if not cache_hit:`
			`self._tokenize(tokens, &span, start, i)`
* Refactor tokenizer, to set the 'spacy' field on TokenC instead of passing a string 2015-07-13 22:46:02 +03:00
			`tokens.data[tokens.length - 1].spacy = string[-1] == ' '`
* Move lang.pyx to tokenizer.pyx 2014-12-19 23:54:49 +03:00			`return tokens`

* Refactor tokenizer, to set the 'spacy' field on TokenC instead of passing a string 2015-07-13 22:46:02 +03:00			`cdef int _try_cache(self, hash_t key, Doc tokens) except -1:`
* Move lang.pyx to tokenizer.pyx 2014-12-19 23:54:49 +03:00			`cached = <_Cached*>self._cache.get(key)`
			`if cached == NULL:`
			`return False`
			`cdef int i`
			`if cached.is_lex:`
* Fix tokenizer 2015-07-14 01:10:51 +03:00			`for i in range(cached.length):`
* Refactor tokenizer, to set the 'spacy' field on TokenC instead of passing a string 2015-07-13 22:46:02 +03:00			`tokens.push_back(cached.data.lexemes[i], False)`
* Move lang.pyx to tokenizer.pyx 2014-12-19 23:54:49 +03:00			`else:`
* Fix tokenizer 2015-07-14 01:10:51 +03:00			`for i in range(cached.length):`
* Refactor tokenizer, to set the 'spacy' field on TokenC instead of passing a string 2015-07-13 22:46:02 +03:00			`tokens.push_back(&cached.data.tokens[i], False)`
* Move lang.pyx to tokenizer.pyx 2014-12-19 23:54:49 +03:00			`return True`

* Rename Tokens to Doc 2015-07-08 19:53:00 +03:00			`cdef int _tokenize(self, Doc tokens, UniStr* span, int start, int end) except -1:`
* Tmp commit. Refactoring to create a Python Lexeme class. 2015-01-12 02:26:22 +03:00			`cdef vector[LexemeC*] prefixes`
			`cdef vector[LexemeC*] suffixes`
* Move lang.pyx to tokenizer.pyx 2014-12-19 23:54:49 +03:00			`cdef hash_t orig_key`
			`cdef int orig_size`
			`orig_key = span.key`
			`orig_size = tokens.length`
			`self._split_affixes(span, &prefixes, &suffixes)`
			`self._attach_tokens(tokens, start, span, &prefixes, &suffixes)`
			`self._save_cached(&tokens.data[orig_size], orig_key, tokens.length - orig_size)`

* Tmp commit. Refactoring to create a Python Lexeme class. 2015-01-12 02:26:22 +03:00			`cdef UniStr* _split_affixes(self, UniStr* string, vector[const LexemeC*] *prefixes,`
			`vector[const LexemeC*] *suffixes) except NULL:`
* Move lang.pyx to tokenizer.pyx 2014-12-19 23:54:49 +03:00			`cdef size_t i`
			`cdef UniStr prefix`
			`cdef UniStr suffix`
			`cdef UniStr minus_pre`
			`cdef UniStr minus_suf`
			`cdef size_t last_size = 0`
			`while string.n != 0 and string.n != last_size:`
			`last_size = string.n`
			`pre_len = self._find_prefix(string.chars, string.n)`
			`if pre_len != 0:`
			`slice_unicode(&prefix, string.chars, 0, pre_len)`
			`slice_unicode(&minus_pre, string.chars, pre_len, string.n)`
			`# Check whether we've hit a special-case`
			`if minus_pre.n >= 1 and self._specials.get(minus_pre.key) != NULL:`
			`string[0] = minus_pre`
			`prefixes.push_back(self.vocab.get(self.vocab.mem, &prefix))`
			`break`
			`suf_len = self._find_suffix(string.chars, string.n)`
			`if suf_len != 0:`
			`slice_unicode(&suffix, string.chars, string.n - suf_len, string.n)`
			`slice_unicode(&minus_suf, string.chars, 0, string.n - suf_len)`
			`# Check whether we've hit a special-case`
			`if minus_suf.n >= 1 and self._specials.get(minus_suf.key) != NULL:`
			`string[0] = minus_suf`
			`suffixes.push_back(self.vocab.get(self.vocab.mem, &suffix))`
			`break`
			`if pre_len and suf_len and (pre_len + suf_len) <= string.n:`
			`slice_unicode(string, string.chars, pre_len, string.n - suf_len)`
			`prefixes.push_back(self.vocab.get(self.vocab.mem, &prefix))`
			`suffixes.push_back(self.vocab.get(self.vocab.mem, &suffix))`
			`elif pre_len:`
			`string[0] = minus_pre`
			`prefixes.push_back(self.vocab.get(self.vocab.mem, &prefix))`
			`elif suf_len:`
			`string[0] = minus_suf`
			`suffixes.push_back(self.vocab.get(self.vocab.mem, &suffix))`
			`if self._specials.get(string.key):`
			`break`
			`return string`

* Rename Tokens to Doc 2015-07-08 19:53:00 +03:00			`cdef int _attach_tokens(self, Doc tokens, int idx, UniStr* string,`
* Tmp commit. Refactoring to create a Python Lexeme class. 2015-01-12 02:26:22 +03:00			`vector[const LexemeC*] *prefixes,`
			`vector[const LexemeC*] *suffixes) except -1:`
* Move lang.pyx to tokenizer.pyx 2014-12-19 23:54:49 +03:00			`cdef bint cache_hit`
* Allow infix tokens to be variable length 2015-07-18 23:45:00 +03:00			`cdef int split, end`
* Tmp commit. Refactoring to create a Python Lexeme class. 2015-01-12 02:26:22 +03:00			`cdef const LexemeC* const* lexemes`
* Refactor tokenizer, to set the 'spacy' field on TokenC instead of passing a string 2015-07-13 22:46:02 +03:00			`cdef const LexemeC* lexeme`
* Move lang.pyx to tokenizer.pyx 2014-12-19 23:54:49 +03:00			`cdef UniStr span`
			`cdef int i`
			`if prefixes.size():`
			`for i in range(prefixes.size()):`
* Refactor tokenizer, to set the 'spacy' field on TokenC instead of passing a string 2015-07-13 22:46:02 +03:00			`tokens.push_back(prefixes[0][i], False)`
* Move lang.pyx to tokenizer.pyx 2014-12-19 23:54:49 +03:00			`if string.n != 0:`
* Refactor tokenizer, to set the 'spacy' field on TokenC instead of passing a string 2015-07-13 22:46:02 +03:00			`cache_hit = self._try_cache(string.key, tokens)`
* Move lang.pyx to tokenizer.pyx 2014-12-19 23:54:49 +03:00			`if cache_hit:`
* Refactor tokenizer, to set the 'spacy' field on TokenC instead of passing a string 2015-07-13 22:46:02 +03:00			`pass`
* Move lang.pyx to tokenizer.pyx 2014-12-19 23:54:49 +03:00			`else:`
* Allow infix tokens to be variable length 2015-07-18 23:45:00 +03:00			`match = self._find_infix(string.chars, string.n)`
			`if match is None:`
* Refactor tokenizer, to set the 'spacy' field on TokenC instead of passing a string 2015-07-13 22:46:02 +03:00			`tokens.push_back(self.vocab.get(tokens.mem, string), False)`
* Move lang.pyx to tokenizer.pyx 2014-12-19 23:54:49 +03:00			`else:`
* Allow infix tokens to be variable length 2015-07-18 23:45:00 +03:00			`split = match.start()`
			`end = match.end()`
			`# Append the beginning, afix, end of the infix span`
* Move lang.pyx to tokenizer.pyx 2014-12-19 23:54:49 +03:00			`slice_unicode(&span, string.chars, 0, split)`
* Refactor tokenizer, to set the 'spacy' field on TokenC instead of passing a string 2015-07-13 22:46:02 +03:00			`tokens.push_back(self.vocab.get(tokens.mem, &span), False)`
* Allow infix tokens to be variable length 2015-07-18 23:45:00 +03:00
			`slice_unicode(&span, string.chars, split, end)`
* Refactor tokenizer, to set the 'spacy' field on TokenC instead of passing a string 2015-07-13 22:46:02 +03:00			`tokens.push_back(self.vocab.get(tokens.mem, &span), False)`
* Allow infix tokens to be variable length 2015-07-18 23:45:00 +03:00
			`slice_unicode(&span, string.chars, end, string.n)`
* Refactor tokenizer, to set the 'spacy' field on TokenC instead of passing a string 2015-07-13 22:46:02 +03:00			`tokens.push_back(self.vocab.get(tokens.mem, &span), False)`
* Tmp commit. Refactoring to create a Python Lexeme class. 2015-01-12 02:26:22 +03:00			`cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin()`
* Move lang.pyx to tokenizer.pyx 2014-12-19 23:54:49 +03:00			`while it != suffixes.rend():`
* Refactor tokenizer, to set the 'spacy' field on TokenC instead of passing a string 2015-07-13 22:46:02 +03:00			`lexeme = deref(it)`
* Move lang.pyx to tokenizer.pyx 2014-12-19 23:54:49 +03:00			`preinc(it)`
* Refactor tokenizer, to set the 'spacy' field on TokenC instead of passing a string 2015-07-13 22:46:02 +03:00			`tokens.push_back(lexeme, False)`
* Move lang.pyx to tokenizer.pyx 2014-12-19 23:54:49 +03:00
			`cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1:`
			`cdef int i`
			`for i in range(n):`
* Update oov check in tokenizer 2015-07-18 23:45:28 +03:00			`if tokens[i].lex.id == 0:`
* Move lang.pyx to tokenizer.pyx 2014-12-19 23:54:49 +03:00			`return 0`
			`cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))`
			`cached.length = n`
			`cached.is_lex = True`
* Tmp commit. Refactoring to create a Python Lexeme class. 2015-01-12 02:26:22 +03:00			`lexemes = <const LexemeC**>self.mem.alloc(n, sizeof(LexemeC**))`
* Move lang.pyx to tokenizer.pyx 2014-12-19 23:54:49 +03:00			`for i in range(n):`
			`lexemes[i] = tokens[i].lex`
* Tmp commit. Refactoring to create a Python Lexeme class. 2015-01-12 02:26:22 +03:00			`cached.data.lexemes = <const LexemeC* const*>lexemes`
* Move lang.pyx to tokenizer.pyx 2014-12-19 23:54:49 +03:00			`self._cache.set(key, cached)`

* Allow infix tokens to be variable length 2015-07-18 23:45:00 +03:00			`cdef object _find_infix(self, Py_UNICODE* chars, size_t length):`
* Move lang.pyx to tokenizer.pyx 2014-12-19 23:54:49 +03:00			`cdef unicode string = chars[:length]`
* Allow infix tokens to be variable length 2015-07-18 23:45:00 +03:00			`return self._infix_re.search(string)`
Remove trailing whitespace 2015-04-19 11:31:31 +03:00
* Move lang.pyx to tokenizer.pyx 2014-12-19 23:54:49 +03:00			`cdef int _find_prefix(self, Py_UNICODE* chars, size_t length) except -1:`
			`cdef unicode string = chars[:length]`
			`match = self._prefix_re.search(string)`
			`return (match.end() - match.start()) if match is not None else 0`

			`cdef int _find_suffix(self, Py_UNICODE* chars, size_t length) except -1:`
			`cdef unicode string = chars[:length]`
			`match = self._suffix_re.search(string)`
			`return (match.end() - match.start()) if match is not None else 0`

* Begin refactor 2015-07-07 15:00:07 +03:00			`def _load_special_tokenization(self, object rules, object tag_map):`
* Move lang.pyx to tokenizer.pyx 2014-12-19 23:54:49 +03:00			`'''Add a special-case tokenization rule.`
			`'''`
			`cdef int i`
			`cdef unicode chunk`
			`cdef list substrings`
			`cdef unicode form`
			`cdef unicode lemma`
			`cdef dict props`
* Tmp commit. Refactoring to create a Python Lexeme class. 2015-01-12 02:26:22 +03:00			`cdef LexemeC** lexemes`
* Move lang.pyx to tokenizer.pyx 2014-12-19 23:54:49 +03:00			`cdef hash_t hashed`
			`cdef UniStr string`
			`for chunk, substrings in sorted(rules.items()):`
			`tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))`
			`for i, props in enumerate(substrings):`
			`form = props['F']`
			`lemma = props.get("L", None)`
			`slice_unicode(&string, form, 0, len(form))`
* Tmp commit. Refactoring to create a Python Lexeme class. 2015-01-12 02:26:22 +03:00			`tokens[i].lex = <LexemeC*>self.vocab.get(self.vocab.mem, &string)`
* Compare to is not None, for more robustness 2015-03-26 05:17:24 +03:00			`if lemma is not None:`
* Move lang.pyx to tokenizer.pyx 2014-12-19 23:54:49 +03:00			`tokens[i].lemma = self.vocab.strings[lemma]`
* Fix Issue #24: Lemmas are empty when the L field is missing for special-cased tokens 2015-02-09 02:30:30 +03:00			`else:`
			`tokens[i].lemma = 0`
* Messily fix morphology and POS tags on special tokens. 2014-12-30 15:24:37 +03:00			`if 'pos' in props:`
* Load tag for specials.json token 2015-03-16 02:24:47 +03:00			`tokens[i].tag = self.vocab.strings[props['pos']]`
* Messily fix morphology and POS tags on special tokens. 2014-12-30 15:24:37 +03:00			`tokens[i].pos = tag_map[props['pos']][0]`
			`# These are defaults, which can be over-ridden by the`
			`# token-specific props.`
			`set_morph_from_dict(&tokens[i].morph, tag_map[props['pos']][1])`
* Fix Issue #24: Lemmas are empty when the L field is missing for special-cased tokens 2015-02-09 02:30:30 +03:00			`if tokens[i].lemma == 0:`
			`tokens[i].lemma = tokens[i].lex.orth`
* Move lang.pyx to tokenizer.pyx 2014-12-19 23:54:49 +03:00			`set_morph_from_dict(&tokens[i].morph, props)`
			`cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))`
			`cached.length = len(substrings)`
			`cached.is_lex = False`
			`cached.data.tokens = tokens`
			`slice_unicode(&string, chunk, 0, len(chunk))`
			`self._specials.set(string.key, cached)`
			`self._cache.set(string.key, cached)`