spaCy/spacy/lexeme.pyx

from cpython.ref cimport Py_INCREF
from cymem.cymem cimport Pool

import orth

# Value get_lexeme_dict stores under 'dist_flags' for a newly built lexeme;
# 0 means no flag bits are set.
# NOTE(review): "OOV" presumably stands for out-of-vocabulary -- confirm.
OOV_DIST_FLAGS = 0


cpdef dict get_lexeme_dict(size_t i, unicode string):
    """Build the initial attribute dict for a lexeme from its id and text.

    Args:
        i: Value stored in the LexInt_i slot of the 'ints' list.
        string: The lexeme's text.

    Returns:
        A dict with the keys 'ints', 'floats', 'strings', 'orth_flags' and
        'dist_flags'.  The three lists are indexed by the LexInt_*,
        LexFloat_* and LexStr_* enum values respectively.
    """
    # Integer attributes: everything except the id and the length starts at 0.
    int_fields = [None] * (<int>LexInt_N)
    int_fields[<int>LexInt_i] = i
    int_fields[<int>LexInt_length] = len(string)
    int_fields[<int>LexInt_cluster] = 0
    int_fields[<int>LexInt_pos] = 0
    int_fields[<int>LexInt_supersense] = 0

    # Float attributes all start at 0.
    float_fields = [None] * (<int>LexFloat_N)
    float_fields[<int>LexFloat_prob] = 0
    float_fields[<int>LexFloat_sentiment] = 0

    # String attributes: the case-fixed form starts out as the original text
    # and the "unsparse" form starts out as the word shape.
    string_fields = [None] * (<int>LexStr_N)
    string_fields[<int>LexStr_orig] = string
    string_fields[<int>LexStr_casefix] = string
    shape = orth.word_shape(string)
    string_fields[<int>LexStr_shape] = shape
    string_fields[<int>LexStr_unsparse] = shape
    string_fields[<int>LexStr_asciied] = orth.asciied(string)

    return {
        'ints': int_fields,
        'floats': float_fields,
        'strings': string_fields,
        'orth_flags': get_orth_flags(string),
        'dist_flags': OOV_DIST_FLAGS,
    }

def get_orth_flags(unicode string):
    """Pack the orthographic predicates of `string` into a bit field.

    Each `orth.is_*` result is shifted to the bit position named by the
    matching LexOrth_* enum value, and the shifted values are OR-ed together.

    Args:
        string: The text to test.

    Returns:
        The combined flags as an integer.
    """
    cdef flag_t bits = (
        (orth.is_ascii(string) << LexOrth_ascii)
        | (orth.is_alpha(string) << LexOrth_alpha)
        | (orth.is_digit(string) << LexOrth_digit)
        | (orth.is_lower(string) << LexOrth_lower)
        | (orth.is_punct(string) << LexOrth_punct)
        | (orth.is_space(string) << LexOrth_space)
        | (orth.is_title(string) << LexOrth_title)
        | (orth.is_upper(string) << LexOrth_upper)
    )
    return bits


def get_dist_flags(unicode string):
    """Return the distributional flags for `string`.

    The argument is currently ignored: the function always returns 0, i.e.
    no flag bits set.
    """
    return 0


cdef char* intern_and_encode(unicode string, size_t* length) except NULL:
    """Encode `string` as UTF-8, intern the bytes, and return a C pointer to them.

    Args:
        string: The text to encode.
        length: Output parameter; `length[0]` receives the number of bytes
            in the UTF-8 encoding (not the number of characters).

    Returns:
        A `char*` pointing into the interned bytes object's buffer.  A NULL
        return signals that a Python exception was raised (`except NULL`).
    """
    cdef bytes byte_string = string.encode('utf8')
    # intern() returns the canonical object for this byte string, so equal
    # inputs end up sharing a single bytes object.
    cdef bytes utf8_string = intern(byte_string)
    # Take an extra reference that this function never gives back.
    # NOTE(review): presumably this keeps the bytes object alive so the
    # pointer returned below stays valid after the locals go out of scope;
    # it also means the count grows on every call -- confirm that is intended.
    Py_INCREF(utf8_string)
    length[0] = len(utf8_string)
    # The cast exposes the bytes object's internal buffer; no copy is made.
    return <char*>utf8_string


cdef int lexeme_get_int(LexemeC* lexeme, size_t i) except *:
    """Return the integer attribute stored at index `i` of `lexeme.ints`.

    No bounds check is performed on `i`.
    NOTE(review): callers presumably pass a LexInt_* enum value -- confirm.
    """
    return lexeme.ints[i]


cdef float lexeme_get_float(LexemeC* lexeme, size_t i) except *:
    """Return the float attribute stored at index `i` of `lexeme.floats`.

    No bounds check is performed on `i`.
    NOTE(review): callers presumably pass a LexFloat_* enum value -- confirm.
    """
    return lexeme.floats[i]


cdef unicode lexeme_get_string(LexemeC* lexeme, size_t i):
    """Return the string attribute at index `i` of `lexeme.strings` as unicode.

    The stored C string is decoded as UTF-8.  No bounds check is performed
    on `i`.
    """
    return lexeme.strings[i].decode('utf8')


cdef bint lexeme_check_orth_flag(LexemeC* lexeme, size_t flag_id) except *:
    """Return True if bit `flag_id` is set in `lexeme.orth_flags`.

    Args:
        lexeme: The lexeme whose orthographic flags are tested.
        flag_id: Zero-based bit position to test.
    """
    # Build the mask at flag_t width.  A bare `1` is a C int, so shifting it
    # by a flag_id at or beyond the width of int is undefined in C.
    cdef flag_t mask = (<flag_t>1) << flag_id
    return (lexeme.orth_flags & mask) != 0


cdef bint lexeme_check_dist_flag(LexemeC* lexeme, size_t flag_id) except *:
    """Return True if bit `flag_id` is set in `lexeme.dist_flags`.

    Args:
        lexeme: The lexeme whose distributional flags are tested.
        flag_id: Zero-based bit position to test.
    """
    # Build the mask at flag_t width.  A bare `1` is a C int, so shifting it
    # by a flag_id at or beyond the width of int is undefined in C.
    cdef flag_t mask = (<flag_t>1) << flag_id
    return (lexeme.dist_flags & mask) != 0


cdef dict lexeme_pack(LexemeC* lex):
    """Copy the fields of a LexemeC struct into a Python dict.

    Args:
        lex: The lexeme to serialize.

    Returns:
        A dict with the keys 'ints', 'floats', 'strings', 'orth_flags' and
        'dist_flags'.  The lists hold the first LexInt_N, LexFloat_N and
        LexStr_N entries of the corresponding struct arrays; the strings are
        decoded from UTF-8.
    """
    cdef dict packed = {}
    packed['ints'] = [lex.ints[i] for i in range(LexInt_N)]
    packed['floats'] = [lex.floats[i] for i in range(LexFloat_N)]
    packed['strings'] = [lex.strings[i].decode('utf8') for i in range(LexStr_N)]
    packed['orth_flags'] = lex.orth_flags
    # Read the distributional flags from their own field; this entry used to
    # be filled from lex.orth_flags by mistake.
    packed['dist_flags'] = lex.dist_flags
    return packed


cdef int lexeme_unpack(LexemeC* lex, dict p) except -1:
    """Fill a LexemeC struct from a dict with the layout lexeme_pack produces.

    Args:
        lex: The struct to write into.
        p: A dict with the keys 'ints', 'floats', 'strings', 'orth_flags'
            and 'dist_flags'.  Lists shorter than the struct arrays leave
            the remaining entries untouched.

    Returns:
        0 on success; -1 signals that a Python exception was raised.

    Raises:
        ValueError: If a list holds more than LexInt_N, LexFloat_N or
            LexStr_N entries respectively.
        KeyError: If one of the expected keys is missing from `p`.
    """
    cdef size_t i
    cdef size_t n_bytes  # required by intern_and_encode; the value is unused
    cdef int lex_int
    cdef float lex_float
    cdef unicode lex_string
    # lexeme_pack emits exactly LexInt_N / LexFloat_N / LexStr_N entries, so
    # anything longer is malformed; reject it instead of writing past the
    # range of the struct arrays that lexeme_pack reads back.
    for i, lex_int in enumerate(p['ints']):
        if i >= <size_t>LexInt_N:
            raise ValueError("lexeme_unpack: too many entries in p['ints']")
        lex.ints[i] = lex_int
    for i, lex_float in enumerate(p['floats']):
        if i >= <size_t>LexFloat_N:
            raise ValueError("lexeme_unpack: too many entries in p['floats']")
        lex.floats[i] = lex_float
    for i, lex_string in enumerate(p['strings']):
        if i >= <size_t>LexStr_N:
            raise ValueError("lexeme_unpack: too many entries in p['strings']")
        lex.strings[i] = intern_and_encode(lex_string, &n_bytes)
    lex.orth_flags = p['orth_flags']
    lex.dist_flags = p['dist_flags']
    return 0
* Upd Tokens to use vector, with bounds checking. 2014-09-15 05:22:40 +04:00			`from cpython.ref cimport Py_INCREF`
* Switch from own memory class to cymem, in pip 2014-09-18 01:09:24 +04:00			`from cymem.cymem cimport Pool`
* Upd Tokens to use vector, with bounds checking. 2014-09-15 05:22:40 +04:00
* Revising data model of lexeme. Compiles. 2014-10-09 12:53:30 +04:00			`import orth`
* Restoring Lexeme-as-struct 2014-09-10 22:41:37 +04:00
* Revising data model of lexeme. Compiles. 2014-10-09 12:53:30 +04:00			`OOV_DIST_FLAGS = 0`
* Restoring Lexeme-as-struct 2014-09-10 22:41:37 +04:00

* Switch to new data model, tests passing 2014-10-10 01:11:31 +04:00			`cpdef dict get_lexeme_dict(size_t i, unicode string):`
* Revising data model of lexeme. Compiles. 2014-10-09 12:53:30 +04:00			`ints = [None for _ in range(LexInt_N)]`
			`ints[<int>LexInt_i] = i`
			`ints[<int>LexInt_length] = len(string)`
			`ints[<int>LexInt_cluster] = 0`
			`ints[<int>LexInt_pos] = 0`
			`ints[<int>LexInt_supersense] = 0`

			`floats = [None for _ in range(LexFloat_N)]`
			`floats[<int>LexFloat_prob] = 0`
			`floats[<int>LexFloat_sentiment] = 0`
* Restoring Lexeme-as-struct 2014-09-10 22:41:37 +04:00
* Revising data model of lexeme. Compiles. 2014-10-09 12:53:30 +04:00			`strings = [None for _ in range(LexStr_N)]`
* Slight cleaning of tokenizer code 2014-10-10 12:17:22 +04:00			`strings[<int>LexStr_orig] = string`
			`strings[<int>LexStr_casefix] = strings[<int>LexStr_orig]`
* Switch to new data model, tests passing 2014-10-10 01:11:31 +04:00			`strings[<int>LexStr_shape] = orth.word_shape(string)`
* Revising data model of lexeme. Compiles. 2014-10-09 12:53:30 +04:00			`strings[<int>LexStr_unsparse] = strings[<int>LexStr_shape]`
* Switch to new data model, tests passing 2014-10-10 01:11:31 +04:00			`strings[<int>LexStr_asciied] = orth.asciied(string)`
* Revising data model of lexeme. Compiles. 2014-10-09 12:53:30 +04:00
			`orth_flags = get_orth_flags(string)`
			`dist_flags = OOV_DIST_FLAGS`

			`return {'ints': ints, 'floats': floats, 'strings': strings,`
			`'orth_flags': orth_flags, 'dist_flags': dist_flags}`

			`def get_orth_flags(unicode string):`
* Switch to new data model, tests passing 2014-10-10 01:11:31 +04:00			`cdef flag_t flags = 0`

			`flags \|= orth.is_ascii(string) << LexOrth_ascii`
			`flags \|= orth.is_alpha(string) << LexOrth_alpha`
			`flags \|= orth.is_digit(string) << LexOrth_digit`
			`flags \|= orth.is_lower(string) << LexOrth_lower`
			`flags \|= orth.is_punct(string) << LexOrth_punct`
			`flags \|= orth.is_space(string) << LexOrth_space`
			`flags \|= orth.is_title(string) << LexOrth_title`
			`flags \|= orth.is_upper(string) << LexOrth_upper`
			`return flags`
* Revising data model of lexeme. Compiles. 2014-10-09 12:53:30 +04:00
* Slight cleaning of tokenizer code 2014-10-10 12:17:22 +04:00
* Revising data model of lexeme. Compiles. 2014-10-09 12:53:30 +04:00			`def get_dist_flags(unicode string):`
			`return 0`


			`cdef char* intern_and_encode(unicode string, size_t* length) except NULL:`
* Fiddle with the way strings are interned in lexeme 2014-09-15 08:34:45 +04:00			`cdef bytes byte_string = string.encode('utf8')`
			`cdef bytes utf8_string = intern(byte_string)`
* Upd Tokens to use vector, with bounds checking. 2014-09-15 05:22:40 +04:00			`Py_INCREF(utf8_string)`
* Move EnglishTokens stuff to Tokens 2014-09-15 03:31:44 +04:00			`length[0] = len(utf8_string)`
* Restoring Lexeme-as-struct 2014-09-10 22:41:37 +04:00			`return <char*>utf8_string`


* Revising data model of lexeme. Compiles. 2014-10-09 12:53:30 +04:00			`cdef int lexeme_get_int(LexemeC* lexeme, size_t i) except *:`
			`return lexeme.ints[i]`
* Restoring Lexeme-as-struct 2014-09-10 22:41:37 +04:00

* Revising data model of lexeme. Compiles. 2014-10-09 12:53:30 +04:00			`cdef float lexeme_get_float(LexemeC* lexeme, size_t i) except *:`
			`return lexeme.floats[i]`


			`cdef unicode lexeme_get_string(LexemeC* lexeme, size_t i):`
			`cdef bytes byte_string = lexeme.strings[i]`
* Restoring Lexeme-as-struct 2014-09-10 22:41:37 +04:00			`return byte_string.decode('utf8')`
* Add serialize/deserialize functions for lexeme, transport to/from python dict. 2014-10-09 07:10:46 +04:00

* Revising data model of lexeme. Compiles. 2014-10-09 12:53:30 +04:00			`cdef bint lexeme_check_orth_flag(LexemeC* lexeme, size_t flag_id) except *:`
			`return lexeme.orth_flags & (1 << flag_id)`


			`cdef bint lexeme_check_dist_flag(LexemeC* lexeme, size_t flag_id) except *:`
			`return lexeme.dist_flags & (1 << flag_id)`


			`cdef dict lexeme_pack(LexemeC* lex):`
* Add serialize/deserialize functions for lexeme, transport to/from python dict. 2014-10-09 07:10:46 +04:00			`cdef dict packed = {}`
* Revising data model of lexeme. Compiles. 2014-10-09 12:53:30 +04:00			`packed['ints'] = [lex.ints[i] for i in range(LexInt_N)]`
			`packed['floats'] = [lex.floats[i] for i in range(LexFloat_N)]`
			`packed['strings'] = [lex.strings[i].decode('utf8') for i in range(LexStr_N)]`
			`packed['orth_flags'] = lex.orth_flags`
			`packed['dist_flags'] = lex.orth_flags`
* Add serialize/deserialize functions for lexeme, transport to/from python dict. 2014-10-09 07:10:46 +04:00			`return packed`


			`cdef int lexeme_unpack(LexemeC* lex, dict p) except -1:`
* Revising data model of lexeme. Compiles. 2014-10-09 12:53:30 +04:00			`cdef size_t i`
			`cdef int lex_int`
			`cdef float lex_float`
			`cdef unicode string`
			`for i, lex_int in enumerate(p['ints']):`
			`lex.ints[i] = lex_int`
			`for i, lex_float in enumerate(p['floats']):`
* Switch to new data model, tests passing 2014-10-10 01:11:31 +04:00			`lex.floats[i] = lex_float`
* Revising data model of lexeme. Compiles. 2014-10-09 12:53:30 +04:00			`cdef size_t _`
			`for i, lex_string in enumerate(p['strings']):`
			`lex.strings[i] = intern_and_encode(lex_string, &_)`
			`lex.orth_flags = p['orth_flags']`
* Switch to new data model, tests passing 2014-10-10 01:11:31 +04:00			`lex.dist_flags = p['dist_flags']`