spaCy/spacy/spacy.pyx

# cython: profile=True
from __future__ import unicode_literals

from libc.stdlib cimport calloc, free
from libcpp.pair cimport pair
from cython.operator cimport dereference as deref

from spacy.lexeme cimport Lexeme
from spacy.lexeme cimport BLANK_WORD
from murmurhash cimport mrmr

from spacy.string_tools cimport substr

from . import util
from os import path
cimport cython


#cdef inline StringHash hash_string(unicode string, size_t length):
#    '''Hash unicode with MurmurHash64A'''
#    return hash(string)
#    #cdef bytes byte_string = string.encode('utf8')
#    #return mrmr.hash32(<char*>byte_string, len(byte_string) * sizeof(char), 0)


def get_normalized(unicode lex, size_t length):
    '''Return the normalised form of a word.

    A word made up solely of lower-case alphabetic characters is its own
    normal form and is returned unchanged.  Anything else (capitalised
    words, numbers, punctuation, mixed strings) is replaced by its word
    shape, as computed by get_word_shape.

    lex: the word to normalise.
    length: passed through to get_word_shape unchanged.
    '''
    if not (lex.isalpha() and lex.islower()):
        return get_word_shape(lex, length)
    return lex


def get_word_shape(unicode lex, length):
    '''Return the orthographic shape of a word.

    Every alphabetic character maps to "X" (upper-case) or "x" (otherwise),
    every digit maps to "d", and any other character is kept as-is.  Runs of
    the same shape character are truncated to at most three, so "Hello"
    becomes "Xxxx" and "1234567" becomes "ddd".

    lex: the word to describe.
    length: accepted for signature compatibility; not used here.

    Fails the assertion if the resulting shape is empty (i.e. lex is empty).
    '''
    pieces = []
    previous = ""
    symbol = ""
    run = 0
    for ch in lex:
        if ch.isalpha():
            symbol = "X" if ch.isupper() else "x"
        elif ch.isdigit():
            symbol = "d"
        else:
            symbol = ch
        if symbol != previous:
            # A new run starts: reset the counter and remember the symbol.
            run = 0
            previous = symbol
        else:
            run += 1
        # Positions 0, 1 and 2 of a run are emitted; the rest are dropped.
        if run < 3:
            pieces.append(symbol)
    shape = "".join(pieces)
    assert shape
    return shape


def set_orth_flags(lex, length):
    '''Return the orthographic flag bits for a word.

    Placeholder implementation: no flags are computed yet, so the result is
    always 0 regardless of lex and length.
    '''
    flags = 0
    return flags


cdef class Language:
    '''Tokenizer and vocabulary store for one language.

    State kept on the instance:
      name   -- the name given at construction; also used to read the
                tokenization rules via util.read_tokenization.
      bacov  -- dict mapping hash values back to the strings that produced
                them (reverse index used by unhash).
      chunks -- hash map from the hash of a whitespace-delimited chunk to the
                address of a NULL-terminated Lexeme* array.
      vocab  -- hash map from the hash of a word to the address of its Lexeme.

    Lexeme, Orthography, Distribution and chunk arrays are allocated with
    calloc and are never freed by this class.
    '''
    def __cinit__(self, name):
        self.name = name
        self.bacov = {}
        self.chunks = dense_hash_map[StringHash, size_t]()
        self.vocab = dense_hash_map[StringHash, size_t]()
        # Key 0 is reserved as the "empty" marker of both maps.
        self.chunks.set_empty_key(0)
        self.vocab.set_empty_key(0)
        self.load_tokenization(util.read_tokenization(name))

    cdef Tokens tokenize(self, unicode string):
        '''Split string on whitespace and return the lexemes as a Tokens.

        Each maximal run of non-whitespace characters (see _is_whitespace)
        is looked up as a chunk, and every lexeme of that chunk is appended
        to the result in order.
        '''
        cdef Lexeme** chunk
        cdef Tokens tokens = Tokens(self)
        cdef size_t length = len(string)
        cdef Py_UNICODE* characters = <Py_UNICODE*>string
        cdef Py_UNICODE c
        cdef size_t start = 0
        cdef size_t i
        for i in range(length):
            c = characters[i]
            if _is_whitespace(c):
                if start < i:
                    chunk = self.lookup_chunk(&characters[start], i - start)
                    _extend(tokens, chunk)
                start = i + 1
        # Flush the final chunk.  Compare against length rather than the last
        # loop index: after the loop i == length - 1, so testing `start < i`
        # dropped a trailing one-character token (e.g. the "b" in "a b") and
        # read an uninitialised i for empty input.
        if start < length:
            chunk = self.lookup_chunk(&characters[start], length - start)
            _extend(tokens, chunk)
        return tokens

    cdef Lexeme* lookup(self, unicode string) except NULL:
        '''Return the Lexeme for string, creating it on first use.

        The empty string maps to the shared BLANK_WORD lexeme.
        '''
        cdef Lexeme* word
        if len(string) == 0:
            return &BLANK_WORD
        word = <Lexeme*>self.vocab[hash(string)]
        if word == NULL:
            word = self.new_lexeme(string)
        return word

    cdef Lexeme** lookup_chunk(self, Py_UNICODE* c_string, size_t length) except NULL:
        '''Return the NULL-terminated Lexeme* array for a chunk of text.

        c_string points at the first character of the chunk and length is
        its size in characters.  Unknown chunks are split with
        find_substrings, converted with new_chunk and cached by hash.
        '''
        cdef StringHash h = mrmr.hash32(c_string, length * sizeof(Py_UNICODE), 0)
        cdef Lexeme** chunk = <Lexeme**>self.chunks[h]
        cdef unicode string
        if chunk == NULL:
            # Build the Python string once and reuse it for both calls.
            string = c_string[:length]
            chunk = self.new_chunk(string, self.find_substrings(string))
            self.chunks[h] = <size_t>chunk
        return chunk

    cdef Lexeme** new_chunk(self, unicode string, list substrings) except NULL:
        '''Allocate a NULL-terminated array holding one Lexeme* per substring.

        string is the full chunk text; it is not used here.  Raises
        MemoryError if the array cannot be allocated.
        '''
        cdef size_t n = len(substrings)
        cdef size_t i
        cdef Lexeme** chunk = <Lexeme**>calloc(n + 1, sizeof(Lexeme*))
        if chunk == NULL:
            raise MemoryError()
        for i in range(n):
            chunk[i] = self.lookup(substrings[i])
        # Terminate using the list length, not the loop variable: the loop
        # variable is unbound when substrings is empty.
        chunk[n] = NULL
        return chunk

    cdef Lexeme* new_lexeme(self, unicode string) except NULL:
        '''Allocate a Lexeme for string and register it in vocab and bacov.

        Raises MemoryError if the Lexeme cannot be allocated.
        '''
        cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
        if word == NULL:
            raise MemoryError()
        # NOTE(review): if hash(string) were ever 0 it would coincide with
        # the empty key registered in __cinit__ -- confirm this cannot happen.
        word.lex = hash(string)
        self.bacov[word.lex] = string
        word.orth = self.new_orth(string)
        word.dist = self.new_dist(string)
        self.vocab[word.lex] = <size_t>word
        return word

    cdef Orthography* new_orth(self, unicode lex) except NULL:
        '''Allocate and fill the Orthography record for lex.

        Stores the first character, the length, the flags and the hashes of
        the last-three-characters suffix, the normalised form and the word
        shape.  The three strings are also added to bacov so that their
        hashes can be reversed.  Raises MemoryError if allocation fails.
        '''
        cdef unicode last3
        cdef unicode norm
        cdef unicode shape
        cdef int length
        cdef Orthography* orth

        length = len(lex)
        orth = <Orthography*>calloc(1, sizeof(Orthography))
        if orth == NULL:
            raise MemoryError()
        orth.first = lex[0]

        orth.length = length
        orth.flags = set_orth_flags(lex, orth.length)
        # NOTE(review): for words shorter than three characters the start
        # offset is negative; presumably substr handles that -- confirm.
        last3 = substr(lex, length - 3, length, length)
        orth.last3 = hash(last3)
        norm = get_normalized(lex, length)
        orth.norm = hash(norm)
        shape = get_word_shape(lex, length)
        orth.shape = hash(shape)

        self.bacov[orth.last3] = last3
        self.bacov[orth.norm] = norm
        self.bacov[orth.shape] = shape

        return orth

    cdef Distribution* new_dist(self, unicode lex) except NULL:
        '''Allocate a zero-initialised Distribution record.

        lex is not used.  Raises MemoryError if allocation fails.
        '''
        cdef Distribution* dist = <Distribution*>calloc(1, sizeof(Distribution))
        if dist == NULL:
            raise MemoryError()
        return dist

    cdef unicode unhash(self, StringHash hash_value):
        '''Fetch a string from the reverse index, given its hash value.'''
        return self.bacov[hash_value]

    cpdef list find_substrings(self, unicode word):
        '''Split word into substrings at the points chosen by find_split.

        find_split is applied repeatedly to the remaining text; a result of
        0 means "no further split" and the remainder is kept whole.
        '''
        substrings = []
        while word:
            split = self.find_split(word)
            if split == 0:
                substrings.append(word)
                break
            substrings.append(word[:split])
            word = word[split:]
        return substrings

    cdef int find_split(self, unicode word):
        '''Return the index at which word should be split.

        This base implementation returns len(word), i.e. it never splits.
        '''
        return len(word)

    def load_tokenization(self, token_rules=None):
        '''Pre-populate the chunk cache from (chunk, tokens) pairs.

        token_rules: iterable of (chunk, tokens) pairs, where chunk is the
            text to match and tokens is the list of substrings it should be
            split into.  None (the default) loads nothing.
        '''
        cdef StringHash h
        cdef Py_UNICODE* c_string
        if token_rules is None:
            # The default used to fall through to the loop and raise
            # TypeError when iterating None.
            return
        for chunk, tokens in token_rules:
            length = len(chunk)
            c_string = <Py_UNICODE*>chunk
            h = mrmr.hash32(c_string, length * sizeof(Py_UNICODE), 0)
            self.chunks[h] = <size_t>self.new_chunk(chunk, tokens)

    def load_clusters(self):
        '''Read the cluster file and make sure every listed word has a Lexeme.

        The file is read from ../data/en/clusters relative to this module.
        Each line must hold three whitespace-separated fields: cluster bit
        string, token, frequency.  The cluster and case statistics are parsed
        but not stored on the lexeme yet.
        '''
        data_dir = path.join(path.dirname(__file__), '..', 'data', 'en')
        case_stats = util.load_case_stats(data_dir)
        brown_loc = path.join(data_dir, 'clusters')
        with util.utf8open(brown_loc) as browns_file:
            for line in browns_file:
                cluster_str, token_string, freq_str = line.split()
                # Decode as a little-endian string, so that we can do & 15 to get
                # the first 4 bits. See redshift._parse_features.pyx
                cluster = int(cluster_str[::-1], 2)
                upper_pc, title_pc = case_stats.get(token_string.lower(), (0.0, 0.0))
                # Go through lookup rather than new_lexeme so that a word
                # already in the vocabulary is reused instead of being
                # allocated a second time and overwritten in vocab.
                self.lookup(token_string)


cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
    # True for exactly three characters: space, newline and tab.
    return c == ' ' or c == '\n' or c == '\t'


cdef int _extend(Tokens tokens, Lexeme** chunk) except -1:
    # Append the address of every lexeme in a NULL-terminated chunk array to
    # tokens, in order.  Returns 0; -1 is reserved for signalling an error.
    cdef Lexeme** cursor = chunk
    while cursor[0] != NULL:
        tokens.append(<Lexeme_addr>cursor[0])
        cursor += 1
    return 0
* Fixed major efficiency problem, from not quite grokking pass by reference in cython c++ 2014-07-07 09:36:43 +04:00			`# cython: profile=True`
* Initial commit. Tests passing for punctuation handling. Need contractions, file transport, tokenize function, etc. 2014-07-05 22:51:42 +04:00			`from __future__ import unicode_literals`
* Reorganized, moving language-independent stuff to spacy. The functions in spacy ask for the dictionaries and split function on input, but the language-specific modules are curried versions that use the globals 2014-07-07 06:21:06 +04:00
* Switch to dynamically allocating array, based on the document length 2014-07-07 10:05:29 +04:00			`from libc.stdlib cimport calloc, free`
* Fix memory leak in tokenizer, caused by having a fixed vocab. 2014-07-31 21:19:38 +04:00			`from libcpp.pair cimport pair`
			`from cython.operator cimport dereference as deref`
* Switch to dynamically allocating array, based on the document length 2014-07-07 10:05:29 +04:00
* Refactor for string view features. Working on setting up flags and enums. 2014-07-07 18:58:48 +04:00			`from spacy.lexeme cimport Lexeme`
* Reorganized, moving language-independent stuff to spacy. The functions in spacy ask for the dictionaries and split function on input, but the language-specific modules are curried versions that use the globals 2014-07-07 06:21:06 +04:00			`from spacy.lexeme cimport BLANK_WORD`
* Working version, adding improvements 2014-08-18 21:59:59 +04:00			`from murmurhash cimport mrmr`
* Reorganized, moving language-independent stuff to spacy. The functions in spacy ask for the dictionaries and split function on input, but the language-specific modules are curried versions that use the globals 2014-07-07 06:21:06 +04:00
* Refactor for string view features. Working on setting up flags and enums. 2014-07-07 18:58:48 +04:00			`from spacy.string_tools cimport substr`
* Reorganized, moving language-independent stuff to spacy. The functions in spacy ask for the dictionaries and split function on input, but the language-specific modules are curried versions that use the globals 2014-07-07 06:21:06 +04:00
			`from . import util`
* Fixed major efficiency problem, from not quite grokking pass by reference in cython c++ 2014-07-07 09:36:43 +04:00			`from os import path`
			`cimport cython`
* Reorganized, moving language-independent stuff to spacy. The functions in spacy ask for the dictionaries and split function on input, but the language-specific modules are curried versions that use the globals 2014-07-07 06:21:06 +04:00
* Upd from spacy 2014-07-23 20:35:18 +04:00
* Progress to getting WordTree working. Tests pass, but so far it's slower. 2014-08-16 21:59:38 +04:00			`#cdef inline StringHash hash_string(unicode string, size_t length):`
			`# '''Hash unicode with MurmurHash64A'''`
			`# return hash(string)`
			`# #cdef bytes byte_string = string.encode('utf8')`
			`# #return mrmr.hash32(<char>byte_string, len(byte_string) sizeof(char), 0)`


* Refactor for string view features. Working on setting up flags and enums. 2014-07-07 18:58:48 +04:00			`def get_normalized(unicode lex, size_t length):`
* Upd from spacy 2014-07-23 20:35:18 +04:00			`if lex.isalpha() and lex.islower():`
			`return lex`
			`else:`
			`return get_word_shape(lex, length)`
* Refactor for string view features. Working on setting up flags and enums. 2014-07-07 18:58:48 +04:00

* Restore unicode, work on improving string storage. 2014-08-16 16:35:34 +04:00			`def get_word_shape(unicode lex, length):`
* 710k words per second for counts 2014-07-07 21:12:19 +04:00			`shape = ""`
			`last = ""`
			`shape_char = ""`
			`seq = 0`
			`for c in lex:`
			`if c.isalpha():`
			`if c.isupper():`
			`shape_char = "X"`
			`else:`
			`shape_char = "x"`
			`elif c.isdigit():`
			`shape_char = "d"`
			`else:`
			`shape_char = c`
			`if shape_char == last:`
			`seq += 1`
			`else:`
			`seq = 0`
			`last = shape_char`
			`if seq < 3:`
			`shape += shape_char`
			`assert shape`
			`return shape`

* Refactor for string view features. Working on setting up flags and enums. 2014-07-07 18:58:48 +04:00
			`def set_orth_flags(lex, length):`
			`return 0`


* Reorganized code to accomodate Tokens class. Need string views before group_by and count_by can be done well. 2014-07-07 14:47:21 +04:00			`cdef class Language:`
			`def __cinit__(self, name):`
			`self.name = name`
			`self.bacov = {}`
* Refactor spacy so that chunks return arrays of lexemes, so that there is properly one lexeme per word. 2014-08-18 21:14:00 +04:00			`self.chunks = dense_hash_map[StringHash, size_t]()`
			`self.vocab = dense_hash_map[StringHash, size_t]()`
			`self.chunks.set_empty_key(0)`
			`self.vocab.set_empty_key(0)`
* Reorganized code to accomodate Tokens class. Need string views before group_by and count_by can be done well. 2014-07-07 14:47:21 +04:00			`self.load_tokenization(util.read_tokenization(name))`

* Working version, adding improvements 2014-08-18 21:59:59 +04:00			`cdef Tokens tokenize(self, unicode string):`
* Refactor spacy so that chunks return arrays of lexemes, so that there is properly one lexeme per word. 2014-08-18 21:14:00 +04:00			`cdef Lexeme** chunk`
* Reorganized code to accomodate Tokens class. Need string views before group_by and count_by can be done well. 2014-07-07 14:47:21 +04:00			`cdef Tokens tokens = Tokens(self)`
* Working version that uses arrays for chunks, which should be more memory efficient 2014-08-18 22:23:54 +04:00			`cdef size_t length = len(string)`
			`cdef Py_UNICODE* characters = <Py_UNICODE*>string`
			`cdef Py_UNICODE c`
* Working version, adding improvements 2014-08-18 21:59:59 +04:00			`cdef size_t start = 0`
			`cdef size_t i`
			`for i in range(length):`
			`c = characters[i]`
			`if _is_whitespace(c):`
			`if start < i:`
			`chunk = self.lookup_chunk(&characters[start], i - start)`
			`_extend(tokens, chunk)`
			`start = i + 1`
			`if start < i:`
			`chunk = self.lookup_chunk(&characters[start], length - start)`
			`_extend(tokens, chunk)`
* Reorganized code to accomodate Tokens class. Need string views before group_by and count_by can be done well. 2014-07-07 14:47:21 +04:00			`return tokens`

* Refactor spacy so that chunks return arrays of lexemes, so that there is properly one lexeme per word. 2014-08-18 21:14:00 +04:00			`cdef Lexeme* lookup(self, unicode string) except NULL:`
			`if len(string) == 0:`
			`return &BLANK_WORD`
			`cdef Lexeme* word = <Lexeme*>self.vocab[hash(string)]`
			`if word == NULL:`
			`word = self.new_lexeme(string)`
			`return word`
* Refactor for string view features. Working on setting up flags and enums. 2014-07-07 18:58:48 +04:00
* Working version that uses arrays for chunks, which should be more memory efficient 2014-08-18 22:23:54 +04:00			`cdef Lexeme** lookup_chunk(self, Py_UNICODE* c_string, size_t length) except NULL:`
			`cdef StringHash h = mrmr.hash32(c_string, length * sizeof(Py_UNICODE), 0)`
* Working version, adding improvements 2014-08-18 21:59:59 +04:00			`cdef Lexeme chunk = <Lexeme>self.chunks[h]`
* Refactor spacy so that chunks return arrays of lexemes, so that there is properly one lexeme per word. 2014-08-18 21:14:00 +04:00			`cdef int split`
			`if chunk == NULL:`
* Working version that uses arrays for chunks, which should be more memory efficient 2014-08-18 22:23:54 +04:00			`chunk = self.new_chunk(c_string[:length], self.find_substrings(c_string[:length]))`
* Working version, adding improvements 2014-08-18 21:59:59 +04:00			`self.chunks[h] = <size_t>chunk`
* Refactor spacy so that chunks return arrays of lexemes, so that there is properly one lexeme per word. 2014-08-18 21:14:00 +04:00			`return chunk`

			`cdef Lexeme** new_chunk(self, unicode string, list substrings) except NULL:`
			`cdef Lexeme chunk = <Lexeme>calloc(len(substrings) + 1, sizeof(Lexeme*))`
			`for i, substring in enumerate(substrings):`
			`chunk[i] = self.lookup(substring)`
			`chunk[i + 1] = NULL`
			`return chunk`

			`cdef Lexeme* new_lexeme(self, unicode string) except NULL:`
* Refactoring tokenizer 2014-08-16 05:22:03 +04:00			`cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))`
* Remove dependence on murmurhash 2014-08-16 19:37:09 +04:00			`word.lex = hash(string)`
* Restore string saving to spacy 2014-08-16 18:09:24 +04:00			`self.bacov[word.lex] = string`
* Refactor spacy so that chunks return arrays of lexemes, so that there is properly one lexeme per word. 2014-08-18 21:14:00 +04:00			`word.orth = self.new_orth(string)`
			`word.dist = self.new_dist(string)`
			`self.vocab[word.lex] = <size_t>word`
* WordTree in use. Need to reform the way chunks are handled. Should be properly one Lexeme per word, with split points being the things that are cached. 2014-08-16 22:10:22 +04:00			`return word`
* Refactoring tokenizer 2014-08-16 05:22:03 +04:00
* WordTree in use. Need to reform the way chunks are handled. Should be properly one Lexeme per word, with split points being the things that are cached. 2014-08-16 22:10:22 +04:00			`cdef Orthography* new_orth(self, unicode lex) except NULL:`
* Refactoring tokenizer 2014-08-16 05:22:03 +04:00			`cdef unicode last3`
			`cdef unicode norm`
			`cdef unicode shape`
			`cdef int length`

			`length = len(lex)`
			`orth = <Orthography*>calloc(1, sizeof(Orthography))`
* Restore unicode, work on improving string storage. 2014-08-16 16:35:34 +04:00			`orth.first = lex[0]`
* Refactoring tokenizer 2014-08-16 05:22:03 +04:00
			`orth.length = length`
			`orth.flags = set_orth_flags(lex, orth.length)`
* WordTree in use. Need to reform the way chunks are handled. Should be properly one Lexeme per word, with split points being the things that are cached. 2014-08-16 22:10:22 +04:00			`orth.norm = hash(lex)`
* Restore string saving to spacy 2014-08-16 18:09:24 +04:00			`last3 = substr(lex, length - 3, length, length)`
* Remove dependence on murmurhash 2014-08-16 19:37:09 +04:00			`orth.last3 = hash(last3)`
* Restore string saving to spacy 2014-08-16 18:09:24 +04:00			`norm = get_normalized(lex, length)`
* Remove dependence on murmurhash 2014-08-16 19:37:09 +04:00			`orth.norm = hash(norm)`
* Restore string saving to spacy 2014-08-16 18:09:24 +04:00			`shape = get_word_shape(lex, length)`
* Remove dependence on murmurhash 2014-08-16 19:37:09 +04:00			`orth.shape = hash(shape)`
* Restore string saving to spacy 2014-08-16 18:09:24 +04:00
			`self.bacov[orth.last3] = last3`
			`self.bacov[orth.norm] = norm`
			`self.bacov[orth.shape] = shape`
* Refactor for string view features. Working on setting up flags and enums. 2014-07-07 18:58:48 +04:00
			`return orth`
* Reorganized code to accomodate Tokens class. Need string views before group_by and count_by can be done well. 2014-07-07 14:47:21 +04:00
* WordTree in use. Need to reform the way chunks are handled. Should be properly one Lexeme per word, with split points being the things that are cached. 2014-08-16 22:10:22 +04:00			`cdef Distribution* new_dist(self, unicode lex) except NULL:`
* Refactoring tokenizer 2014-08-16 05:22:03 +04:00			`dist = <Distribution*>calloc(1, sizeof(Distribution))`
			`return dist`

			`cdef unicode unhash(self, StringHash hash_value):`
			`'''Fetch a string from the reverse index, given its hash value.'''`
* Restore unicode, work on improving string storage. 2014-08-16 16:35:34 +04:00			`return self.bacov[hash_value]`
* Refactoring tokenizer 2014-08-16 05:22:03 +04:00
* Refactor spacy so that chunks return arrays of lexemes, so that there is properly one lexeme per word. 2014-08-18 21:14:00 +04:00			`cpdef list find_substrings(self, unicode word):`
			`substrings = []`
			`while word:`
			`split = self.find_split(word)`
			`if split == 0:`
			`substrings.append(word)`
			`break`
			`substrings.append(word[:split])`
			`word = word[split:]`
			`return substrings`

			`cdef int find_split(self, unicode word):`
			`return len(word)`
* Refactoring tokenizer 2014-08-16 05:22:03 +04:00
			`def load_tokenization(self, token_rules=None):`
* Working version, adding improvements 2014-08-18 21:59:59 +04:00			`cdef StringHash h`
* Working version that uses arrays for chunks, which should be more memory efficient 2014-08-18 22:23:54 +04:00			`cdef Py_UNICODE* c_string`
* Working version, adding improvements 2014-08-18 21:59:59 +04:00			`cdef bytes byte_string`
* Refactor spacy so that chunks return arrays of lexemes, so that there is properly one lexeme per word. 2014-08-18 21:14:00 +04:00			`for chunk, tokens in token_rules:`
* Working version that uses arrays for chunks, which should be more memory efficient 2014-08-18 22:23:54 +04:00			`length = len(chunk)`
			`c_string = <Py_UNICODE*>chunk`
			`h = mrmr.hash32(c_string, length * sizeof(Py_UNICODE), 0)`
* Working version, adding improvements 2014-08-18 21:59:59 +04:00			`self.chunks[h] = <size_t>self.new_chunk(chunk, tokens)`

* Refactoring tokenizer 2014-08-16 05:22:03 +04:00			`def load_clusters(self):`
			`cdef Lexeme* w`
			`data_dir = path.join(path.dirname(__file__), '..', 'data', 'en')`
			`case_stats = util.load_case_stats(data_dir)`
			`brown_loc = path.join(data_dir, 'clusters')`
			`cdef size_t start`
			`cdef int end`
			`with util.utf8open(brown_loc) as browns_file:`
			`for i, line in enumerate(browns_file):`
			`cluster_str, token_string, freq_str = line.split()`
			`# Decode as a little-endian string, so that we can do & 15 to get`
			`# the first 4 bits. See redshift._parse_features.pyx`
			`cluster = int(cluster_str[::-1], 2)`
			`upper_pc, title_pc = case_stats.get(token_string.lower(), (0.0, 0.0))`
* Refactor spacy so that chunks return arrays of lexemes, so that there is properly one lexeme per word. 2014-08-18 21:14:00 +04:00			`self.new_lexeme(token_string)`
* Working version, adding improvements 2014-08-18 21:59:59 +04:00

* Working version that uses arrays for chunks, which should be more memory efficient 2014-08-18 22:23:54 +04:00			`cdef inline bint _is_whitespace(Py_UNICODE c) nogil:`
			`if c == ' ':`
* Working version, adding improvements 2014-08-18 21:59:59 +04:00			`return True`
* Working version that uses arrays for chunks, which should be more memory efficient 2014-08-18 22:23:54 +04:00			`elif c == '\n':`
* Working version, adding improvements 2014-08-18 21:59:59 +04:00			`return True`
* Working version that uses arrays for chunks, which should be more memory efficient 2014-08-18 22:23:54 +04:00			`elif c == '\t':`
* Working version, adding improvements 2014-08-18 21:59:59 +04:00			`return True`
			`else:`
			`return False`


			`cdef int _extend(Tokens tokens, Lexeme** chunk) except -1:`
			`cdef size_t i = 0`
			`while chunk[i] != NULL:`
			`tokens.append(<Lexeme_addr>chunk[i])`
			`i += 1`