spaCy/spacy/lexeme.pyx

from cpython.ref cimport Py_INCREF
from cymem.cymem cimport Pool
from murmurhash.mrmr cimport hash64

from libc.string cimport memset

import orth

from .utf8string cimport Utf8Str

# NOTE(review): not referenced anywhere in the visible part of this file;
# presumably the flag bits assigned to out-of-vocabulary words -- confirm
# against the modules that import it.
OOV_DIST_FLAGS = 0

# Zero-fill the EMPTY_LEXEME struct once, when the module is imported.
# EMPTY_LEXEME and Lexeme are not declared in the visible part of this file
# (presumably they live in the matching .pxd -- confirm).
memset(&EMPTY_LEXEME, 0, sizeof(Lexeme))


def get_flags(unicode string, float upper_pc, float title_pc, float lower_pc):
    """Pack the orthographic predicates for `string` into a single bit-field.

    Every predicate from the `orth` module is called on `string`; its result
    is shifted left by the matching flag constant and OR-ed into the value
    that is returned.

    `upper_pc`, `title_pc` and `lower_pc` are accepted but are not consulted
    by this function.
    """
    cdef flag_t bits = 0
    # (predicate, bit position) pairs, applied in the order listed.
    checks = (
        (orth.is_alpha, IS_ALPHA),
        (orth.is_ascii, IS_ASCII),
        (orth.is_digit, IS_DIGIT),
        (orth.is_lower, IS_LOWER),
        (orth.is_punct, IS_PUNCT),
        (orth.is_space, IS_SPACE),
        (orth.is_title, IS_TITLE),
        (orth.is_upper, IS_UPPER),
        (orth.like_url, LIKE_URL),
        (orth.like_number, LIKE_NUMBER),
    )
    for predicate, position in checks:
        bits |= predicate(string) << position
    return bits


cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
                  StringStore store, dict props) except *:
    """Fill in and return a Lexeme struct describing `string`.

    Arguments:
        i: Stored as the lexeme's `id`.
        string: The word form. Its length, first character and last three
            characters are recorded, along with several derived forms.
        hashed: Accepted but not used in this function.
        store: String store in which every string-valued field is interned
            (through `get_string_id`); the struct holds the resulting ids.
        props: Optional distributional properties. 'cluster', 'postype',
            'supersense' and 'prob' default to 0; 'upper_pc', 'lower_pc' and
            'title_pc' default to 0.0.

    Returns the populated struct by value. Any exception raised while
    building it propagates to the caller (`except *`).
    """
    cdef Lexeme lex
    cdef float upper_pc
    cdef float lower_pc
    cdef float title_pc

    # NOTE(review): the get_string_id calls below are kept in their original
    # order; ids handed out by the store may depend on insertion order --
    # confirm before reordering.
    lex.id = i
    lex.length = len(string)
    lex.sic = get_string_id(string, store)

    lex.cluster = props.get('cluster', 0)
    lex.postype = props.get('postype', 0)
    lex.supersense = props.get('supersense', 0)
    lex.prob = props.get('prob', 0)

    upper_pc = props.get('upper_pc', 0.0)
    lower_pc = props.get('lower_pc', 0.0)
    title_pc = props.get('title_pc', 0.0)

    lex.prefix = get_string_id(string[0], store)
    lex.suffix = get_string_id(string[-3:], store)

    if not (upper_pc or lower_pc or title_pc):
        # No casing statistics were supplied: reuse the id of the original
        # string instead of computing a canonical casing.
        lex.norm = lex.sic
    else:
        lex.norm = get_string_id(
            orth.canon_case(string, upper_pc, title_pc, lower_pc), store)

    lex.shape = get_string_id(orth.word_shape(string), store)
    lex.asciied = get_string_id(orth.asciied(string), store)
    lex.vocab10k = get_string_id(
        orth.non_sparse(string, lex.prob, lex.cluster,
                        upper_pc, title_pc, lower_pc),
        store)
    lex.flags = get_flags(string, upper_pc, title_pc, lower_pc)
    return lex


cdef id_t get_string_id(unicode string, StringStore store) except 0:
    # Intern `string` in `store` and return the `i` field of the entry that
    # `store.intern` hands back. The string is passed to the store as UTF-8
    # bytes plus their byte length. Declared `except 0`: a return value of 0
    # is how an exception is signalled to Cython callers.
    cdef bytes utf8_bytes = string.encode('utf8')
    # `utf8_bytes` stays referenced for the whole call, so the raw pointer
    # into its buffer remains valid while the store reads from it.
    cdef char* raw_chars = <char*>utf8_bytes
    cdef Utf8Str* interned = store.intern(raw_chars, len(utf8_bytes))
    return interned.i
* Upd Tokens to use vector, with bounds checking. 2014-09-15 05:22:40 +04:00			`from cpython.ref cimport Py_INCREF`
* Switch from own memory class to cymem, in pip 2014-09-18 01:09:24 +04:00			`from cymem.cymem cimport Pool`
* Rewriting Lexeme serialization. 2014-10-29 15:19:38 +03:00			`from murmurhash.mrmr cimport hash64`
* Upd Tokens to use vector, with bounds checking. 2014-09-15 05:22:40 +04:00
* Remove the feature array stuff from Tokens class, and replace vector with array-based implementation, with padding. 2014-10-22 18:57:59 +04:00			`from libc.string cimport memset`

* Revising data model of lexeme. Compiles. 2014-10-09 12:53:30 +04:00			`import orth`
* Restoring Lexeme-as-struct 2014-09-10 22:41:37 +04:00
* Large refactor, particularly to Python API 2014-10-23 17:59:17 +04:00			`from .utf8string cimport Utf8Str`
* Restoring Lexeme-as-struct 2014-09-10 22:41:37 +04:00
* Large refactor, particularly to Python API 2014-10-23 17:59:17 +04:00			`OOV_DIST_FLAGS = 0`
* Revising data model of lexeme. Compiles. 2014-10-09 12:53:30 +04:00
* Large refactor, particularly to Python API 2014-10-23 17:59:17 +04:00			`memset(&EMPTY_LEXEME, 0, sizeof(Lexeme))`
* Revising data model of lexeme. Compiles. 2014-10-09 12:53:30 +04:00

* Rewriting Lexeme serialization. 2014-10-29 15:19:38 +03:00			`def get_flags(unicode string, float upper_pc, float title_pc, float lower_pc):`
* Switch to new data model, tests passing 2014-10-10 01:11:31 +04:00			`cdef flag_t flags = 0`
* Large refactor, particularly to Python API 2014-10-23 17:59:17 +04:00			`flags \|= orth.is_alpha(string) << IS_ALPHA`
			`flags \|= orth.is_ascii(string) << IS_ASCII`
			`flags \|= orth.is_digit(string) << IS_DIGIT`
			`flags \|= orth.is_lower(string) << IS_LOWER`
			`flags \|= orth.is_punct(string) << IS_PUNCT`
			`flags \|= orth.is_space(string) << IS_SPACE`
			`flags \|= orth.is_title(string) << IS_TITLE`
			`flags \|= orth.is_upper(string) << IS_UPPER`
* Add LIKE_URL and LIKE_NUMBER flag features 2014-11-02 05:19:05 +03:00
			`flags \|= orth.like_url(string) << LIKE_URL`
			`flags \|= orth.like_number(string) << LIKE_NUMBER`
* Switch to new data model, tests passing 2014-10-10 01:11:31 +04:00			`return flags`
* Revising data model of lexeme. Compiles. 2014-10-09 12:53:30 +04:00
* Slight cleaning of tokenizer code 2014-10-10 12:17:22 +04:00
* Restore id attribute to lexeme, and rename pos field to postype, to store clustered tag dictionaries 2014-10-31 09:43:00 +03:00			`cpdef Lexeme init(id_t i, unicode string, hash_t hashed,`
* Rewriting Lexeme serialization. 2014-10-29 15:19:38 +03:00			`StringStore store, dict props) except *:`
			`cdef Lexeme lex`
* Restore id attribute to lexeme, and rename pos field to postype, to store clustered tag dictionaries 2014-10-31 09:43:00 +03:00			`lex.id = i`
* Rewriting Lexeme serialization. 2014-10-29 15:19:38 +03:00			`lex.length = len(string)`
			`lex.sic = get_string_id(string, store)`

			`lex.cluster = props.get('cluster', 0)`
* Restore id attribute to lexeme, and rename pos field to postype, to store clustered tag dictionaries 2014-10-31 09:43:00 +03:00			`lex.postype = props.get('postype', 0)`
* Rewriting Lexeme serialization. 2014-10-29 15:19:38 +03:00			`lex.supersense = props.get('supersense', 0)`
			`lex.prob = props.get('prob', 0)`

			`cdef float upper_pc = props.get('upper_pc', 0.0)`
			`cdef float lower_pc = props.get('lower_pc', 0.0)`
			`cdef float title_pc = props.get('title_pc', 0.0)`

			`lex.prefix = get_string_id(string[0], store)`
			`lex.suffix = get_string_id(string[-3:], store)`
* Small efficiency tweak to lexeme init 2014-10-30 09:56:11 +03:00			`if upper_pc or lower_pc or title_pc:`
			`canon_cased = orth.canon_case(string, upper_pc, title_pc, lower_pc)`
			`lex.norm = get_string_id(canon_cased, store)`
			`else:`
			`lex.norm = lex.sic`
* Rewriting Lexeme serialization. 2014-10-29 15:19:38 +03:00			`lex.shape = get_string_id(orth.word_shape(string), store)`
			`lex.asciied = get_string_id(orth.asciied(string), store)`
			`non_sparse = orth.non_sparse(string, lex.prob, lex.cluster, upper_pc, title_pc, lower_pc)`
			`lex.vocab10k = get_string_id(non_sparse, store)`
			`lex.flags = get_flags(string, upper_pc, title_pc, lower_pc)`
			`return lex`

* Restore id attribute to lexeme, and rename pos field to postype, to store clustered tag dictionaries 2014-10-31 09:43:00 +03:00
* Fiddle with data types on Lexeme, to compress them to a much smaller size. 2014-10-30 07:42:15 +03:00			`cdef id_t get_string_id(unicode string, StringStore store) except 0:`
* Fiddle with the way strings are interned in lexeme 2014-09-15 08:34:45 +04:00			`cdef bytes byte_string = string.encode('utf8')`
* Large refactor, particularly to Python API 2014-10-23 17:59:17 +04:00			`cdef Utf8Str* orig_str = store.intern(<char*>byte_string, len(byte_string))`
* Rewriting Lexeme serialization. 2014-10-29 15:19:38 +03:00			`return orig_str.i`