* Make flag-setting a language-specific thing

Matthew Honnibal 2014-12-03 11:04:00 +11:00
parent 71b009e323
commit b463a7eb86
6 changed files with 224 additions and 146 deletions
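In outline, the change works like this: each language class now owns a `set_flags` callback that packs boolean orthographic features into a single bitfield, and the shared `Lexicon` stores whatever callback it was constructed with and applies it when building a new `Lexeme`. A minimal plain-Python sketch of the pattern (hypothetical names mirroring the diff, not spaCy's actual API):

```python
# Minimal sketch of the pattern this commit introduces (hypothetical,
# plain Python; the real code is Cython).
IS_ALPHA, IS_DIGIT, IS_TITLE = 0, 1, 2   # bit positions, as in the FlagID enum

def english_set_flags(string):
    # Language-specific: pack each boolean feature into its own bit.
    flags = 0
    flags |= string.isalpha() << IS_ALPHA
    flags |= string.isdigit() << IS_DIGIT
    flags |= string.istitle() << IS_TITLE
    return flags

class Lexicon(object):
    # Language-agnostic: store the callback, apply it on first sight of a string.
    def __init__(self, set_flags=None):
        self.set_flags = set_flags
        self._cache = {}

    def get(self, string):
        if string not in self._cache:
            self._cache[string] = {'sic': string, 'flags': self.set_flags(string)}
        return self._cache[string]

lexicon = Lexicon(english_set_flags)
assert lexicon.get(u'Hello')['flags'] & (1 << IS_TITLE)
```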

View File

@@ -1,6 +1,32 @@
 from spacy.lang cimport Language
 from spacy.tokens cimport Tokens
 
+# Flags
+cpdef enum FlagID:
+    IS_ALPHA
+    IS_ASCII
+    IS_DIGIT
+    IS_LOWER
+    IS_PUNCT
+    IS_SPACE
+    IS_TITLE
+    IS_UPPER
+
+    LIKE_URL
+    LIKE_NUMBER
+
+    OFT_LOWER
+    OFT_TITLE
+    OFT_UPPER
+
+    IN_MALES
+    IN_FEMALES
+    IN_SURNAMES
+    IN_PLACES
+    IN_GAMES
+    IN_CELEBS
+    IN_NAMES
+
 
 cdef class English(Language):
     pass
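The FlagID members above take consecutive values starting at 0 and are used as bit positions, not as values in their own right. A short sketch of the convention (assumed values, matching declaration order):

```python
# Enum members declared in order get the values 0, 1, 2, ..., and each
# names one bit inside a 64-bit flags field (assumed convention).
IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER = 0, 1, 2, 3

flags = (1 << IS_ALPHA) | (1 << IS_LOWER)   # an alphabetic, lowercase token
assert flags & (1 << IS_LOWER)
assert not flags & (1 << IS_DIGIT)
```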

View File

@@ -38,6 +38,8 @@ provides a fully Penn Treebank 3-compliant tokenizer.
 from __future__ import unicode_literals
 
 cimport lang
+from .typedefs cimport flags_t
+import orth
 
 
 cdef class English(Language):
@@ -47,7 +49,20 @@ cdef class English(Language):
     name (unicode): The two letter code used by Wikipedia for the language.
     lexicon (Lexicon): The lexicon. Exposes the lookup method.
     """
-    pass
+    def set_flags(self, unicode string):
+        cdef flags_t flags = 0
+        flags |= orth.is_alpha(string) << IS_ALPHA
+        flags |= orth.is_ascii(string) << IS_ASCII
+        flags |= orth.is_digit(string) << IS_DIGIT
+        flags |= orth.is_lower(string) << IS_LOWER
+        flags |= orth.is_punct(string) << IS_PUNCT
+        flags |= orth.is_space(string) << IS_SPACE
+        flags |= orth.is_title(string) << IS_TITLE
+        flags |= orth.is_upper(string) << IS_UPPER
+        flags |= orth.like_url(string) << LIKE_URL
+        flags |= orth.like_number(string) << LIKE_NUMBER
+        return flags
+
 
 EN = English('en')
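A detail worth noting about `set_flags`: the `predicate << BIT` idiom assumes each orth predicate returns exactly 0 or 1; a larger truthy integer would spill into neighbouring bits. A runnable sketch of the idiom, with plain-Python stand-ins for the orth module:

```python
# The same bit-packing idiom as set_flags above; bool() makes explicit the
# contract that each predicate contributes exactly one bit (stand-ins, not
# the real orth module).
IS_ALPHA, IS_TITLE = 0, 6   # bit positions as declared in the FlagID enum

def pack(string):
    flags = 0
    flags |= bool(string.isalpha()) << IS_ALPHA
    flags |= bool(string.istitle()) << IS_TITLE
    return flags

assert pack(u'Token') == (1 << IS_ALPHA) | (1 << IS_TITLE)
```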

View File

@@ -8,23 +8,17 @@ from cymem.cymem cimport Pool
 from .typedefs cimport hash_t
 from .tokens cimport Tokens
 from .lexeme cimport Lexeme
-from .tagger cimport Tagger
-from .utf8string cimport StringStore
-
-
-cdef struct String:
-    Py_UNICODE* chars
-    size_t n
-    hash_t key
+from .utf8string cimport StringStore, UniStr
 
 
 cdef class Lexicon:
+    cpdef public set_flags
+
     cdef Pool mem
     cpdef readonly size_t size
     cpdef readonly StringStore strings
     cdef vector[Lexeme*] lexemes
 
-    cdef Lexeme* get(self, String* s) except NULL
+    cdef Lexeme* get(self, UniStr* s) except NULL
 
     cdef PreshMap _map
@@ -43,10 +37,10 @@ cdef class Language:
 
     cpdef Tokens tokens_from_list(self, list strings)
     cpdef Tokens tokenize(self, unicode text)
 
-    cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1
-    cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes,
+    cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1
+    cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes,
                                 vector[Lexeme*] *suffixes) except NULL
-    cdef int _attach_tokens(self, Tokens tokens, int idx, String* string,
+    cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
                             vector[Lexeme*] *prefixes, vector[Lexeme*] *suffixes) except -1
     cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1
     cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1
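The deleted String struct and the new UniStr carry the same three fields: a pointer into a character buffer, a length, and a precomputed hash key. A rough Python model of what `slice_unicode` (now provided by utf8string, not shown in this diff) produces; Python's `hash()` stands in for the 64-bit MurmurHash used in the C code:

```python
from collections import namedtuple

# Hypothetical model of the UniStr span struct: a view into a parent
# string plus a precomputed key, built by slice_unicode.
UniStr = namedtuple('UniStr', ['chars', 'n', 'key'])

def slice_unicode(chars, start, end):
    span = chars[start:end]
    return UniStr(span, end - start, hash(span))

s = slice_unicode(u"don't", 0, 2)   # the "do" prefix
assert (s.chars, s.n) == (u'do', 2)
```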

View File

@@ -19,6 +19,8 @@ from .lexeme cimport Lexeme
 from .lexeme cimport EMPTY_LEXEME
 from .lexeme cimport init as lexeme_init
+from .utf8string cimport slice_unicode
+
 from . import util
 from .util import read_lang_data
 from .tokens import Tokens
@@ -34,7 +36,7 @@ cdef class Language:
         self._prefix_re = re.compile(prefix)
         self._suffix_re = re.compile(suffix)
         self._infix_re = re.compile(infix)
-        self.lexicon = Lexicon()
+        self.lexicon = Lexicon(self.set_flags)
         if path.exists(path.join(util.DATA_DIR, name, 'lexemes')):
             self.lexicon.load(path.join(util.DATA_DIR, name, 'lexemes'))
             self.lexicon.strings.load(path.join(util.DATA_DIR, name, 'strings'))
@@ -45,11 +47,11 @@ cdef class Language:
         cdef Tokens tokens = Tokens(self.lexicon.strings, length)
         if length == 0:
             return tokens
-        cdef String string_struct
+        cdef UniStr string_struct
         cdef unicode py_string
         cdef int idx = 0
         for i, py_string in enumerate(strings):
-            string_from_unicode(&string_struct, py_string)
+            slice_unicode(&string_struct, py_string, 0, len(py_string))
             tokens.push_back(idx, self.lexicon.get(&string_struct))
             idx += len(py_string) + 1
         return tokens
@@ -77,11 +79,11 @@ cdef class Language:
         cdef int start = 0
         cdef Py_UNICODE* chars = string
         cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0])
-        cdef String span
+        cdef UniStr span
         for i in range(1, length):
             if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
                 if start < i:
-                    string_slice(&span, chars, start, i)
+                    slice_unicode(&span, chars, start, i)
                     lexemes = <Lexeme**>self._cache.get(span.key)
                     if lexemes != NULL:
                         tokens.extend(start, lexemes, 0)
@@ -93,7 +95,7 @@ cdef class Language:
                     start += 1
         i += 1
         if start < i:
-            string_slice(&span, chars, start, i)
+            slice_unicode(&span, chars, start, i)
             lexemes = <Lexeme**>self._cache.get(span.key)
             if lexemes != NULL:
                 tokens.extend(start, lexemes, 0)
@@ -101,7 +103,7 @@ cdef class Language:
                 self._tokenize(tokens, &span, start, i)
         return tokens
 
-    cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1:
+    cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1:
         cdef vector[Lexeme*] prefixes
         cdef vector[Lexeme*] suffixes
         cdef hash_t orig_key
@@ -112,20 +114,20 @@ cdef class Language:
         self._attach_tokens(tokens, start, span, &prefixes, &suffixes)
         self._save_cached(&tokens.lex[orig_size], orig_key, tokens.length - orig_size)
 
-    cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes,
+    cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes,
                                 vector[Lexeme*] *suffixes) except NULL:
         cdef size_t i
-        cdef String prefix
-        cdef String suffix
-        cdef String minus_pre
-        cdef String minus_suf
+        cdef UniStr prefix
+        cdef UniStr suffix
+        cdef UniStr minus_pre
+        cdef UniStr minus_suf
         cdef size_t last_size = 0
         while string.n != 0 and string.n != last_size:
             last_size = string.n
             pre_len = self._find_prefix(string.chars, string.n)
             if pre_len != 0:
-                string_slice(&prefix, string.chars, 0, pre_len)
-                string_slice(&minus_pre, string.chars, pre_len, string.n)
+                slice_unicode(&prefix, string.chars, 0, pre_len)
+                slice_unicode(&minus_pre, string.chars, pre_len, string.n)
                 # Check whether we've hit a special-case
                 if minus_pre.n >= 1 and self._specials.get(minus_pre.key) != NULL:
                     string[0] = minus_pre
@@ -133,15 +135,15 @@ cdef class Language:
                     break
             suf_len = self._find_suffix(string.chars, string.n)
             if suf_len != 0:
-                string_slice(&suffix, string.chars, string.n - suf_len, string.n)
-                string_slice(&minus_suf, string.chars, 0, string.n - suf_len)
+                slice_unicode(&suffix, string.chars, string.n - suf_len, string.n)
+                slice_unicode(&minus_suf, string.chars, 0, string.n - suf_len)
                 # Check whether we've hit a special-case
                 if minus_suf.n >= 1 and self._specials.get(minus_suf.key) != NULL:
                     string[0] = minus_suf
                     suffixes.push_back(self.lexicon.get(&suffix))
                     break
             if pre_len and suf_len and (pre_len + suf_len) <= string.n:
-                string_slice(string, string.chars, pre_len, string.n - suf_len)
+                slice_unicode(string, string.chars, pre_len, string.n - suf_len)
                 prefixes.push_back(self.lexicon.get(&prefix))
                 suffixes.push_back(self.lexicon.get(&suffix))
             elif pre_len:
@@ -155,13 +157,13 @@ cdef class Language:
         return string
 
     cdef int _attach_tokens(self, Tokens tokens,
-                            int idx, String* string,
+                            int idx, UniStr* string,
                             vector[Lexeme*] *prefixes,
                             vector[Lexeme*] *suffixes) except -1:
         cdef int split
         cdef Lexeme** lexemes
         cdef Lexeme* lexeme
-        cdef String span
+        cdef UniStr span
         if prefixes.size():
             idx = tokens.extend(idx, prefixes.data(), prefixes.size())
         if string.n != 0:
@@ -174,11 +176,11 @@ cdef class Language:
             if split == 0 or split == -1:
                 idx = tokens.push_back(idx, self.lexicon.get(string))
             else:
-                string_slice(&span, string.chars, 0, split)
+                slice_unicode(&span, string.chars, 0, split)
                 idx = tokens.push_back(idx, self.lexicon.get(&span))
-                string_slice(&span, string.chars, split, split+1)
+                slice_unicode(&span, string.chars, split, split+1)
                 idx = tokens.push_back(idx, self.lexicon.get(&span))
-                string_slice(&span, string.chars, split + 1, string.n)
+                slice_unicode(&span, string.chars, split + 1, string.n)
                 idx = tokens.push_back(idx, self.lexicon.get(&span))
         cdef vector[Lexeme*].reverse_iterator it = suffixes.rbegin()
         while it != suffixes.rend():
@@ -222,14 +224,14 @@ cdef class Language:
         '''
        cdef Lexeme** lexemes
         cdef hash_t hashed
-        cdef String string
+        cdef UniStr string
         for uni_string, substrings in token_rules:
             lexemes = <Lexeme**>self.mem.alloc(len(substrings) + 1, sizeof(Lexeme*))
             for i, substring in enumerate(substrings):
-                string_from_unicode(&string, substring)
+                slice_unicode(&string, substring, 0, len(substring))
                 lexemes[i] = <Lexeme*>self.lexicon.get(&string)
             lexemes[i + 1] = NULL
-            string_from_unicode(&string, uni_string)
+            slice_unicode(&string, uni_string, 0, len(uni_string))
             self._specials.set(string.key, lexemes)
             self._cache.set(string.key, lexemes)
@@ -239,21 +241,23 @@ cdef class Lexicon:
 
     Also interns UTF-8 strings, and maps them to consecutive integer IDs.
     '''
-    def __init__(self):
+    def __init__(self, object set_flags=None):
         self.mem = Pool()
         self._map = PreshMap(2 ** 20)
         self.strings = StringStore()
         self.lexemes.push_back(&EMPTY_LEXEME)
         self.size = 1
+        self.set_flags = set_flags
 
-    cdef Lexeme* get(self, String* string) except NULL:
+    cdef Lexeme* get(self, UniStr* string) except NULL:
         '''Retrieve a pointer to a Lexeme from the lexicon.'''
         cdef Lexeme* lex
         lex = <Lexeme*>self._map.get(string.key)
         if lex != NULL:
             return lex
         lex = <Lexeme*>self.mem.alloc(sizeof(Lexeme), 1)
-        lex[0] = lexeme_init(self.size, string.chars[:string.n], string.key, self.strings, {})
+        lex[0] = lexeme_init(self.size, string.chars[:string.n], string.key,
+                             self.strings, {'flags': self.set_flags(string.chars[:string.n])})
         self._map.set(string.key, lex)
         while self.lexemes.size() < (lex.id + 1):
             self.lexemes.push_back(&EMPTY_LEXEME)
@@ -283,14 +287,14 @@ cdef class Lexicon:
         '''
         if type(id_or_string) == int:
             return self.lexemes.at(id_or_string)[0]
-        cdef String string
-        string_from_unicode(&string, id_or_string)
+        cdef UniStr string
+        slice_unicode(&string, id_or_string, 0, len(id_or_string))
         cdef Lexeme* lexeme = self.get(&string)
         return lexeme[0]
 
     def __setitem__(self, unicode uni_string, dict props):
-        cdef String s
-        string_from_unicode(&s, uni_string)
+        cdef UniStr s
+        slice_unicode(&s, uni_string, 0, len(uni_string))
         cdef Lexeme* lex = self.get(&s)
         lex[0] = lexeme_init(lex.id, s.chars[:s.n], s.key, self.strings, props)
@@ -338,14 +342,3 @@ cdef class Lexicon:
                 i += 1
             self.size += 1
         fclose(fp)
-
-
-cdef void string_from_unicode(String* s, unicode uni):
-    cdef Py_UNICODE* c_uni = <Py_UNICODE*>uni
-    string_slice(s, c_uni, 0, len(uni))
-
-
-cdef inline void string_slice(String* s, Py_UNICODE* chars, int start, int end) nogil:
-    s.chars = &chars[start]
-    s.n = end - start
-    s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
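For orientation, the `_split_affixes` loop peels prefix and suffix punctuation off a token until nothing more matches or a special-case entry is found. A simplified, runnable model (plain strings instead of UniStr spans, hypothetical affix patterns, and no special-case table):

```python
import re

# Toy model of the affix-splitting loop above. Real spaCy loads its affix
# patterns from data files; these two patterns are illustrative only.
PREFIX_RE = re.compile(r'^[\("]')
SUFFIX_RE = re.compile(r'[\)",\.]$')

def split_affixes(string):
    prefixes, suffixes = [], []
    last_size = -1
    while string and len(string) != last_size:
        last_size = len(string)
        m = PREFIX_RE.search(string)
        if m:                          # strip one prefix character
            prefixes.append(m.group())
            string = string[m.end():]
        m = SUFFIX_RE.search(string)
        if m:                          # strip one suffix character
            suffixes.insert(0, m.group())
            string = string[:m.start()]
    return prefixes, string, suffixes

assert split_affixes(u'("hello,")') == ([u'(', u'"'], u'hello', [u',', u'"', u')'])
```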

View File

@@ -1,61 +1,119 @@
-from .typedefs cimport hash_t, utf8_t, flag_t, id_t, len_t, tag_t
+from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t
 from .utf8string cimport StringStore
-from libc.stdint cimport uint16_t
-
-cpdef flag_t OOV_DIST_FLAGS
 
-# Flags
-cpdef enum:
-    IS_ALPHA
-    IS_ASCII
-    IS_DIGIT
-    IS_LOWER
-    IS_PUNCT
-    IS_SPACE
-    IS_TITLE
-    IS_UPPER
-
-    LIKE_URL
-    LIKE_NUMBER
-
-    OFT_LOWER
-    OFT_TITLE
-    OFT_UPPER
-
-    IN_MALES
-    IN_FEMALES
-    IN_SURNAMES
-    IN_PLACES
-    IN_GAMES
-    IN_CELEBS
-    IN_NAMES
+# Reserve 64 values for flag features
+cpdef enum attr_id_t:
+    FLAG0
+    FLAG1
+    FLAG2
+    FLAG3
+    FLAG4
+    FLAG5
+    FLAG6
+    FLAG7
+    FLAG8
+    FLAG9
+    FLAG10
+    FLAG11
+    FLAG12
+    FLAG13
+    FLAG14
+    FLAG15
+    FLAG16
+    FLAG17
+    FLAG18
+    FLAG19
+    FLAG20
+    FLAG21
+    FLAG22
+    FLAG23
+    FLAG24
+    FLAG25
+    FLAG26
+    FLAG27
+    FLAG28
+    FLAG29
+    FLAG30
+    FLAG31
+    FLAG32
+    FLAG33
+    FLAG34
+    FLAG35
+    FLAG36
+    FLAG37
+    FLAG38
+    FLAG39
+    FLAG40
+    FLAG41
+    FLAG42
+    FLAG43
+    FLAG44
+    FLAG45
+    FLAG46
+    FLAG47
+    FLAG48
+    FLAG49
+    FLAG50
+    FLAG51
+    FLAG52
+    FLAG53
+    FLAG54
+    FLAG55
+    FLAG56
+    FLAG57
+    FLAG58
+    FLAG59
+    FLAG60
+    FLAG61
+    FLAG62
+    FLAG63
+
+    ID
+    SIC
+    NORM
+    SHAPE
+    ASCIIED
+    PREFIX
+    SUFFIX
+
+    LENGTH
+    CLUSTER
+    POS_TYPE
+    SENSE_TYPE
 
 
 cdef struct Lexeme:
-    flag_t flags
+    flags_t flags
 
-    id_t id
-    id_t sic
-    id_t norm
-    id_t shape
-    id_t asciied
-    id_t prefix
-    id_t suffix
+    attr_t id
+    attr_t sic
+    attr_t norm
+    attr_t shape
+    attr_t asciied
+    attr_t prefix
+    attr_t suffix
+
+    attr_t length
+    attr_t cluster
+    attr_t pos_type
+    attr_t sense_type
 
     float prob
+    float upper_pc
+    float title_pc
 
-    len_t length
-    tag_t cluster
-    tag_t postype
-    tag_t supersense
 
 cdef Lexeme EMPTY_LEXEME
 
-cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
-                  StringStore store, dict props) except *
+cpdef Lexeme init(id_t i, unicode string, hash_t hashed, StringStore store,
+                  dict props) except *
 
 
-cdef inline bint check_flag(Lexeme* lexeme, size_t flag_id) nogil:
+cdef inline bint check_flag(Lexeme* lexeme, attr_id_t flag_id) nogil:
     return lexeme.flags & (1 << flag_id)
 
+
+cdef attr_t get_attr(Lexeme* lex, attr_id_t attr_id)
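The attr_id_t layout does double duty: the first 64 members (FLAG0 through FLAG63) are bit positions within flags_t, and the members from ID onward name whole-valued fields. That is what lets get_attr dispatch on a single comparison against `sizeof(flags_t) * 8`. A small Python model of the scheme (field values assumed):

```python
from enum import IntEnum

# Hypothetical model: ids below 64 address single bits in the flags field;
# ids from 64 up name whole-valued attributes (FLAG0..FLAG63 occupy 0..63).
class AttrId(IntEnum):
    ID = 64
    SIC = 65
    NORM = 66

N_FLAG_BITS = 64   # sizeof(flags_t) * 8 for a 64-bit flags_t

def get_attr(lex, attr_id):
    if attr_id < N_FLAG_BITS:
        return (lex['flags'] >> attr_id) & 1      # check_flag
    return lex[AttrId(attr_id).name.lower()]      # whole-valued field

lex = {'flags': 0b101, 'id': 7, 'sic': 42, 'norm': 42}
assert get_attr(lex, 2) == 1                # FLAG2 is set
assert get_attr(lex, AttrId.SIC) == 42
```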

View File

@@ -6,67 +6,59 @@ from libc.string cimport memset
 
 import orth
-from .utf8string cimport Utf8Str
-
-OOV_DIST_FLAGS = 0
 
 memset(&EMPTY_LEXEME, 0, sizeof(Lexeme))
 
-def get_flags(unicode string, float upper_pc, float title_pc, float lower_pc):
-    cdef flag_t flags = 0
-    flags |= orth.is_alpha(string) << IS_ALPHA
-    flags |= orth.is_ascii(string) << IS_ASCII
-    flags |= orth.is_digit(string) << IS_DIGIT
-    flags |= orth.is_lower(string) << IS_LOWER
-    flags |= orth.is_punct(string) << IS_PUNCT
-    flags |= orth.is_space(string) << IS_SPACE
-    flags |= orth.is_title(string) << IS_TITLE
-    flags |= orth.is_upper(string) << IS_UPPER
-    flags |= orth.like_url(string) << LIKE_URL
-    flags |= orth.like_number(string) << LIKE_NUMBER
-    return flags
-
 
 cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
-                  StringStore store, dict props) except *:
+                  StringStore string_store, dict props) except *:
     cdef Lexeme lex
     lex.id = i
     lex.length = len(string)
-    lex.sic = get_string_id(string, store)
+    lex.sic = string_store[string]
 
     lex.cluster = props.get('cluster', 0)
-    lex.postype = props.get('postype', 0)
-    lex.supersense = props.get('supersense', 0)
+    lex.pos_type = props.get('pos_type', 0)
+    lex.sense_type = props.get('sense_type', 0)
     lex.prob = props.get('prob', 0)
 
-    cdef float upper_pc = props.get('upper_pc', 0.0)
-    cdef float lower_pc = props.get('lower_pc', 0.0)
-    cdef float title_pc = props.get('title_pc', 0.0)
+    lex.upper_pc = props.get('upper_pc', 0.0)
+    lex.title_pc = props.get('lower_pc', 0.0)
 
-    lex.prefix = get_string_id(string[0], store)
-    lex.suffix = get_string_id(string[-3:], store)
+    lex.prefix = string_store[string[:1]]
+    lex.suffix = string_store[string[-3:]]
 
-    if upper_pc or lower_pc or title_pc:
-        canon_cased = orth.canon_case(string, upper_pc, title_pc, lower_pc)
-        lex.norm = get_string_id(canon_cased, store)
-    else:
-        lex.norm = lex.sic
-    lex.shape = get_string_id(orth.word_shape(string), store)
-    lex.asciied = get_string_id(orth.asciied(string), store)
-    lex.flags = get_flags(string, upper_pc, title_pc, lower_pc)
+    lex.norm = lex.sic # TODO
+    lex.shape = string_store[orth.word_shape(string)]
+    lex.asciied = string_store[orth.asciied(string)]
 
-    lex.flags |= props.get('in_males', 0) << IN_MALES
-    lex.flags |= props.get('in_females', 0) << IN_FEMALES
-    lex.flags |= props.get('in_surnames', 0) << IN_SURNAMES
-    lex.flags |= props.get('in_places', 0) << IN_PLACES
-    lex.flags |= props.get('in_celebs', 0) << IN_CELEBS
-    lex.flags |= props.get('in_games', 0) << IN_GAMES
-    lex.flags |= props.get('in_names', 0) << IN_NAMES
+    lex.flags = props.get('flags', 0)
     return lex
 
 
-cdef id_t get_string_id(unicode string, StringStore store) except 0:
-    cdef bytes byte_string = string.encode('utf8')
-    cdef Utf8Str* orig_str = store.intern(<char*>byte_string, len(byte_string))
-    return orig_str.i
+cdef attr_t get_attr(Lexeme* lex, attr_id_t feat_name):
+    if feat_name < (sizeof(flags_t) * 8):
+        return check_flag(lex, feat_name)
+    elif feat_name == ID:
+        return lex.id
+    elif feat_name == SIC:
+        return lex.sic
+    elif feat_name == NORM:
+        return lex.norm
+    elif feat_name == SHAPE:
+        return lex.shape
+    elif feat_name == ASCIIED:
+        return lex.asciied
+    elif feat_name == PREFIX:
+        return lex.prefix
+    elif feat_name == SUFFIX:
+        return lex.suffix
+    elif feat_name == LENGTH:
+        return lex.length
+    elif feat_name == CLUSTER:
+        return lex.cluster
+    elif feat_name == POS_TYPE:
+        return lex.pos_type
+    elif feat_name == SENSE_TYPE:
+        return lex.sense_type
+    else:
+        raise StandardError('Feature ID: %d not found' % feat_name)
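The removed get_string_id helper is replaced by StringStore's indexing syntax, `string_store[string]`, which interns the string and returns a stable integer id. A toy model of that behaviour (hypothetical; id 0 is reserved, much as the empty lexeme is in the C code):

```python
# Toy model of StringStore indexing as used in init above: indexing with a
# unicode string interns it and hands back a stable integer id.
class StringStore(object):
    def __init__(self):
        self._ids = {}
        self._strings = [u'']            # id 0 reserved

    def __getitem__(self, string):
        if string not in self._ids:
            self._ids[string] = len(self._strings)
            self._strings.append(string)
        return self._ids[string]

store = StringStore()
assert store[u'hello'] == store[u'hello'] == 1
assert store[u'world'] == 2
```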