mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 10:46:29 +03:00
* Make flag-setting a language-specific thing
This commit is contained in:
parent
71b009e323
commit
b463a7eb86
26
spacy/en.pxd
26
spacy/en.pxd
|
@ -1,6 +1,32 @@
|
|||
from spacy.lang cimport Language
|
||||
from spacy.tokens cimport Tokens
|
||||
|
||||
# Flags
|
||||
# Identifiers for the boolean lexeme flags used by the English class.
# Members are numbered consecutively from 0 in declaration order, and
# English.set_flags uses them as bit positions (`value << FLAG`), so
# reordering the members changes which bit each flag occupies.
cpdef enum FlagID:
    # Filled in by English.set_flags from the orth.is_* predicates.
    IS_ALPHA
    IS_ASCII
    IS_DIGIT
    IS_LOWER
    IS_PUNCT
    IS_SPACE
    IS_TITLE
    IS_UPPER

    # Filled in by English.set_flags from the orth.like_* predicates.
    LIKE_URL
    LIKE_NUMBER

    # NOTE(review): the remaining flags are not assigned by
    # English.set_flags as shown; presumably they are populated from
    # external frequency / gazetteer data -- confirm against the loader.
    OFT_LOWER
    OFT_TITLE
    OFT_UPPER

    IN_MALES
    IN_FEMALES
    IN_SURNAMES
    IN_PLACES
    IN_GAMES
    IN_CELEBS
    IN_NAMES
|
||||
|
||||
|
||||
# Declaration of the English extension type: a Language subclass that
# declares no additional C-level attributes or methods of its own here.
cdef class English(Language):
    pass
|
||||
|
|
17
spacy/en.pyx
17
spacy/en.pyx
|
@ -38,6 +38,8 @@ provides a fully Penn Treebank 3-compliant tokenizer.
|
|||
from __future__ import unicode_literals
|
||||
|
||||
cimport lang
|
||||
from .typedefs cimport flags_t
|
||||
import orth
|
||||
|
||||
|
||||
cdef class English(Language):
|
||||
|
@ -47,7 +49,20 @@ cdef class English(Language):
|
|||
name (unicode): The two letter code used by Wikipedia for the language.
|
||||
lexicon (Lexicon): The lexicon. Exposes the lookup method.
|
||||
"""
|
||||
pass
|
||||
def set_flags(self, unicode string):
    """Compute the boolean flag bits for a string.

    Each predicate from the orth module is evaluated on the string, in
    the order listed, and its result is shifted to the bit position named
    by the matching flag ID.

    Args:
        string (unicode): The string to inspect.

    Returns:
        The combined flags value (one bit per predicate that held).
    """
    cdef flags_t bits = 0
    # (bit position, predicate) pairs; the order matches the flag IDs.
    checks = (
        (IS_ALPHA, orth.is_alpha),
        (IS_ASCII, orth.is_ascii),
        (IS_DIGIT, orth.is_digit),
        (IS_LOWER, orth.is_lower),
        (IS_PUNCT, orth.is_punct),
        (IS_SPACE, orth.is_space),
        (IS_TITLE, orth.is_title),
        (IS_UPPER, orth.is_upper),
        (LIKE_URL, orth.like_url),
        (LIKE_NUMBER, orth.like_number),
    )
    for bit, predicate in checks:
        bits |= predicate(string) << bit
    return bits
|
||||
|
||||
|
||||
EN = English('en')
|
||||
|
|
|
@ -8,23 +8,17 @@ from cymem.cymem cimport Pool
|
|||
from .typedefs cimport hash_t
|
||||
from .tokens cimport Tokens
|
||||
from .lexeme cimport Lexeme
|
||||
from .tagger cimport Tagger
|
||||
from .utf8string cimport StringStore
|
||||
|
||||
|
||||
cdef struct String:
|
||||
Py_UNICODE* chars
|
||||
size_t n
|
||||
hash_t key
|
||||
from .utf8string cimport StringStore, UniStr
|
||||
|
||||
|
||||
cdef class Lexicon:
|
||||
cpdef public set_flags
|
||||
cdef Pool mem
|
||||
cpdef readonly size_t size
|
||||
cpdef readonly StringStore strings
|
||||
cdef vector[Lexeme*] lexemes
|
||||
|
||||
cdef Lexeme* get(self, String* s) except NULL
|
||||
cdef Lexeme* get(self, UniStr* s) except NULL
|
||||
|
||||
cdef PreshMap _map
|
||||
|
||||
|
@ -43,10 +37,10 @@ cdef class Language:
|
|||
cpdef Tokens tokens_from_list(self, list strings)
|
||||
cpdef Tokens tokenize(self, unicode text)
|
||||
|
||||
cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1
|
||||
cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes,
|
||||
cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1
|
||||
cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes,
|
||||
vector[Lexeme*] *suffixes) except NULL
|
||||
cdef int _attach_tokens(self, Tokens tokens, int idx, String* string,
|
||||
cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
|
||||
vector[Lexeme*] *prefixes, vector[Lexeme*] *suffixes) except -1
|
||||
cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1
|
||||
cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1
|
||||
|
|
|
@ -19,6 +19,8 @@ from .lexeme cimport Lexeme
|
|||
from .lexeme cimport EMPTY_LEXEME
|
||||
from .lexeme cimport init as lexeme_init
|
||||
|
||||
from .utf8string cimport slice_unicode
|
||||
|
||||
from . import util
|
||||
from .util import read_lang_data
|
||||
from .tokens import Tokens
|
||||
|
@ -34,7 +36,7 @@ cdef class Language:
|
|||
self._prefix_re = re.compile(prefix)
|
||||
self._suffix_re = re.compile(suffix)
|
||||
self._infix_re = re.compile(infix)
|
||||
self.lexicon = Lexicon()
|
||||
self.lexicon = Lexicon(self.set_flags)
|
||||
if path.exists(path.join(util.DATA_DIR, name, 'lexemes')):
|
||||
self.lexicon.load(path.join(util.DATA_DIR, name, 'lexemes'))
|
||||
self.lexicon.strings.load(path.join(util.DATA_DIR, name, 'strings'))
|
||||
|
@ -45,11 +47,11 @@ cdef class Language:
|
|||
cdef Tokens tokens = Tokens(self.lexicon.strings, length)
|
||||
if length == 0:
|
||||
return tokens
|
||||
cdef String string_struct
|
||||
cdef UniStr string_struct
|
||||
cdef unicode py_string
|
||||
cdef int idx = 0
|
||||
for i, py_string in enumerate(strings):
|
||||
string_from_unicode(&string_struct, py_string)
|
||||
slice_unicode(&string_struct, py_string, 0, len(py_string))
|
||||
tokens.push_back(idx, self.lexicon.get(&string_struct))
|
||||
idx += len(py_string) + 1
|
||||
return tokens
|
||||
|
@ -77,11 +79,11 @@ cdef class Language:
|
|||
cdef int start = 0
|
||||
cdef Py_UNICODE* chars = string
|
||||
cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0])
|
||||
cdef String span
|
||||
cdef UniStr span
|
||||
for i in range(1, length):
|
||||
if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
|
||||
if start < i:
|
||||
string_slice(&span, chars, start, i)
|
||||
slice_unicode(&span, chars, start, i)
|
||||
lexemes = <Lexeme**>self._cache.get(span.key)
|
||||
if lexemes != NULL:
|
||||
tokens.extend(start, lexemes, 0)
|
||||
|
@ -93,7 +95,7 @@ cdef class Language:
|
|||
start += 1
|
||||
i += 1
|
||||
if start < i:
|
||||
string_slice(&span, chars, start, i)
|
||||
slice_unicode(&span, chars, start, i)
|
||||
lexemes = <Lexeme**>self._cache.get(span.key)
|
||||
if lexemes != NULL:
|
||||
tokens.extend(start, lexemes, 0)
|
||||
|
@ -101,7 +103,7 @@ cdef class Language:
|
|||
self._tokenize(tokens, &span, start, i)
|
||||
return tokens
|
||||
|
||||
cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1:
|
||||
cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1:
|
||||
cdef vector[Lexeme*] prefixes
|
||||
cdef vector[Lexeme*] suffixes
|
||||
cdef hash_t orig_key
|
||||
|
@ -112,20 +114,20 @@ cdef class Language:
|
|||
self._attach_tokens(tokens, start, span, &prefixes, &suffixes)
|
||||
self._save_cached(&tokens.lex[orig_size], orig_key, tokens.length - orig_size)
|
||||
|
||||
cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes,
|
||||
cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes,
|
||||
vector[Lexeme*] *suffixes) except NULL:
|
||||
cdef size_t i
|
||||
cdef String prefix
|
||||
cdef String suffix
|
||||
cdef String minus_pre
|
||||
cdef String minus_suf
|
||||
cdef UniStr prefix
|
||||
cdef UniStr suffix
|
||||
cdef UniStr minus_pre
|
||||
cdef UniStr minus_suf
|
||||
cdef size_t last_size = 0
|
||||
while string.n != 0 and string.n != last_size:
|
||||
last_size = string.n
|
||||
pre_len = self._find_prefix(string.chars, string.n)
|
||||
if pre_len != 0:
|
||||
string_slice(&prefix, string.chars, 0, pre_len)
|
||||
string_slice(&minus_pre, string.chars, pre_len, string.n)
|
||||
slice_unicode(&prefix, string.chars, 0, pre_len)
|
||||
slice_unicode(&minus_pre, string.chars, pre_len, string.n)
|
||||
# Check whether we've hit a special-case
|
||||
if minus_pre.n >= 1 and self._specials.get(minus_pre.key) != NULL:
|
||||
string[0] = minus_pre
|
||||
|
@ -133,15 +135,15 @@ cdef class Language:
|
|||
break
|
||||
suf_len = self._find_suffix(string.chars, string.n)
|
||||
if suf_len != 0:
|
||||
string_slice(&suffix, string.chars, string.n - suf_len, string.n)
|
||||
string_slice(&minus_suf, string.chars, 0, string.n - suf_len)
|
||||
slice_unicode(&suffix, string.chars, string.n - suf_len, string.n)
|
||||
slice_unicode(&minus_suf, string.chars, 0, string.n - suf_len)
|
||||
# Check whether we've hit a special-case
|
||||
if minus_suf.n >= 1 and self._specials.get(minus_suf.key) != NULL:
|
||||
string[0] = minus_suf
|
||||
suffixes.push_back(self.lexicon.get(&suffix))
|
||||
break
|
||||
if pre_len and suf_len and (pre_len + suf_len) <= string.n:
|
||||
string_slice(string, string.chars, pre_len, string.n - suf_len)
|
||||
slice_unicode(string, string.chars, pre_len, string.n - suf_len)
|
||||
prefixes.push_back(self.lexicon.get(&prefix))
|
||||
suffixes.push_back(self.lexicon.get(&suffix))
|
||||
elif pre_len:
|
||||
|
@ -155,13 +157,13 @@ cdef class Language:
|
|||
return string
|
||||
|
||||
cdef int _attach_tokens(self, Tokens tokens,
|
||||
int idx, String* string,
|
||||
int idx, UniStr* string,
|
||||
vector[Lexeme*] *prefixes,
|
||||
vector[Lexeme*] *suffixes) except -1:
|
||||
cdef int split
|
||||
cdef Lexeme** lexemes
|
||||
cdef Lexeme* lexeme
|
||||
cdef String span
|
||||
cdef UniStr span
|
||||
if prefixes.size():
|
||||
idx = tokens.extend(idx, prefixes.data(), prefixes.size())
|
||||
if string.n != 0:
|
||||
|
@ -174,11 +176,11 @@ cdef class Language:
|
|||
if split == 0 or split == -1:
|
||||
idx = tokens.push_back(idx, self.lexicon.get(string))
|
||||
else:
|
||||
string_slice(&span, string.chars, 0, split)
|
||||
slice_unicode(&span, string.chars, 0, split)
|
||||
idx = tokens.push_back(idx, self.lexicon.get(&span))
|
||||
string_slice(&span, string.chars, split, split+1)
|
||||
slice_unicode(&span, string.chars, split, split+1)
|
||||
idx = tokens.push_back(idx, self.lexicon.get(&span))
|
||||
string_slice(&span, string.chars, split + 1, string.n)
|
||||
slice_unicode(&span, string.chars, split + 1, string.n)
|
||||
idx = tokens.push_back(idx, self.lexicon.get(&span))
|
||||
cdef vector[Lexeme*].reverse_iterator it = suffixes.rbegin()
|
||||
while it != suffixes.rend():
|
||||
|
@ -222,14 +224,14 @@ cdef class Language:
|
|||
'''
|
||||
cdef Lexeme** lexemes
|
||||
cdef hash_t hashed
|
||||
cdef String string
|
||||
cdef UniStr string
|
||||
for uni_string, substrings in token_rules:
|
||||
lexemes = <Lexeme**>self.mem.alloc(len(substrings) + 1, sizeof(Lexeme*))
|
||||
for i, substring in enumerate(substrings):
|
||||
string_from_unicode(&string, substring)
|
||||
slice_unicode(&string, substring, 0, len(substring))
|
||||
lexemes[i] = <Lexeme*>self.lexicon.get(&string)
|
||||
lexemes[i + 1] = NULL
|
||||
string_from_unicode(&string, uni_string)
|
||||
slice_unicode(&string, uni_string, 0, len(uni_string))
|
||||
self._specials.set(string.key, lexemes)
|
||||
self._cache.set(string.key, lexemes)
|
||||
|
||||
|
@ -239,21 +241,23 @@ cdef class Lexicon:
|
|||
|
||||
Also interns UTF-8 strings, and maps them to consecutive integer IDs.
|
||||
'''
|
||||
def __init__(self):
|
||||
def __init__(self, object set_flags=None):
    """Create an empty lexicon.

    Args:
        set_flags: Optional callable that takes a unicode string and
            returns the flags value to store on a newly created lexeme.
            When omitted, new lexemes get a flags value of 0.
    """
    self.mem = Pool()
    self._map = PreshMap(2 ** 20)
    self.strings = StringStore()
    # Slot 0 of the lexeme table holds the empty lexeme, so the size
    # counter starts at 1.
    self.lexemes.push_back(&EMPTY_LEXEME)
    self.size = 1
    # get() calls self.set_flags unconditionally when it creates a lexeme,
    # so a missing callback must still be callable rather than None.
    if set_flags is None:
        set_flags = lambda string: 0
    self.set_flags = set_flags
|
||||
|
||||
cdef Lexeme* get(self, String* string) except NULL:
|
||||
cdef Lexeme* get(self, UniStr* string) except NULL:
|
||||
'''Retrieve a pointer to a Lexeme from the lexicon.'''
|
||||
cdef Lexeme* lex
|
||||
lex = <Lexeme*>self._map.get(string.key)
|
||||
if lex != NULL:
|
||||
return lex
|
||||
lex = <Lexeme*>self.mem.alloc(sizeof(Lexeme), 1)
|
||||
lex[0] = lexeme_init(self.size, string.chars[:string.n], string.key, self.strings, {})
|
||||
lex[0] = lexeme_init(self.size, string.chars[:string.n], string.key,
|
||||
self.strings, {'flags': self.set_flags(string.chars[:string.n])})
|
||||
self._map.set(string.key, lex)
|
||||
while self.lexemes.size() < (lex.id + 1):
|
||||
self.lexemes.push_back(&EMPTY_LEXEME)
|
||||
|
@ -283,14 +287,14 @@ cdef class Lexicon:
|
|||
'''
|
||||
if type(id_or_string) == int:
|
||||
return self.lexemes.at(id_or_string)[0]
|
||||
cdef String string
|
||||
string_from_unicode(&string, id_or_string)
|
||||
cdef UniStr string
|
||||
slice_unicode(&string, id_or_string, 0, len(id_or_string))
|
||||
cdef Lexeme* lexeme = self.get(&string)
|
||||
return lexeme[0]
|
||||
|
||||
def __setitem__(self, unicode uni_string, dict props):
|
||||
cdef String s
|
||||
string_from_unicode(&s, uni_string)
|
||||
cdef UniStr s
|
||||
slice_unicode(&s, uni_string, 0, len(uni_string))
|
||||
cdef Lexeme* lex = self.get(&s)
|
||||
lex[0] = lexeme_init(lex.id, s.chars[:s.n], s.key, self.strings, props)
|
||||
|
||||
|
@ -338,14 +342,3 @@ cdef class Lexicon:
|
|||
i += 1
|
||||
self.size += 1
|
||||
fclose(fp)
|
||||
|
||||
|
||||
cdef void string_from_unicode(String* s, unicode uni):
|
||||
cdef Py_UNICODE* c_uni = <Py_UNICODE*>uni
|
||||
string_slice(s, c_uni, 0, len(uni))
|
||||
|
||||
|
||||
cdef inline void string_slice(String* s, Py_UNICODE* chars, int start, int end) nogil:
|
||||
s.chars = &chars[start]
|
||||
s.n = end - start
|
||||
s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
|
||||
|
|
142
spacy/lexeme.pxd
142
spacy/lexeme.pxd
|
@ -1,61 +1,119 @@
|
|||
from .typedefs cimport hash_t, utf8_t, flag_t, id_t, len_t, tag_t
|
||||
from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t
|
||||
|
||||
from .utf8string cimport StringStore
|
||||
from libc.stdint cimport uint16_t
|
||||
|
||||
cpdef flag_t OOV_DIST_FLAGS
|
||||
|
||||
# Flags
|
||||
cpdef enum:
|
||||
IS_ALPHA
|
||||
IS_ASCII
|
||||
IS_DIGIT
|
||||
IS_LOWER
|
||||
IS_PUNCT
|
||||
IS_SPACE
|
||||
IS_TITLE
|
||||
IS_UPPER
|
||||
# Reserve 64 values for flag features
|
||||
cpdef enum attr_id_t:
|
||||
FLAG0
|
||||
FLAG1
|
||||
FLAG2
|
||||
FLAG3
|
||||
FLAG4
|
||||
FLAG5
|
||||
FLAG6
|
||||
FLAG7
|
||||
FLAG8
|
||||
FLAG9
|
||||
FLAG10
|
||||
FLAG11
|
||||
FLAG12
|
||||
FLAG13
|
||||
FLAG14
|
||||
FLAG15
|
||||
FLAG16
|
||||
FLAG17
|
||||
FLAG18
|
||||
FLAG19
|
||||
FLAG20
|
||||
FLAG21
|
||||
FLAG22
|
||||
FLAG23
|
||||
FLAG24
|
||||
FLAG25
|
||||
FLAG26
|
||||
FLAG27
|
||||
FLAG28
|
||||
FLAG29
|
||||
FLAG30
|
||||
FLAG31
|
||||
FLAG32
|
||||
FLAG33
|
||||
FLAG34
|
||||
FLAG35
|
||||
FLAG36
|
||||
FLAG37
|
||||
FLAG38
|
||||
FLAG39
|
||||
FLAG40
|
||||
FLAG41
|
||||
FLAG42
|
||||
FLAG43
|
||||
FLAG44
|
||||
FLAG45
|
||||
FLAG46
|
||||
FLAG47
|
||||
FLAG48
|
||||
FLAG49
|
||||
FLAG50
|
||||
FLAG51
|
||||
FLAG52
|
||||
FLAG53
|
||||
FLAG54
|
||||
FLAG55
|
||||
FLAG56
|
||||
FLAG57
|
||||
FLAG58
|
||||
FLAG59
|
||||
FLAG60
|
||||
FLAG61
|
||||
FLAG62
|
||||
FLAG63
|
||||
|
||||
LIKE_URL
|
||||
LIKE_NUMBER
|
||||
ID
|
||||
SIC
|
||||
NORM
|
||||
SHAPE
|
||||
ASCIIED
|
||||
PREFIX
|
||||
SUFFIX
|
||||
|
||||
OFT_LOWER
|
||||
OFT_TITLE
|
||||
OFT_UPPER
|
||||
|
||||
IN_MALES
|
||||
IN_FEMALES
|
||||
IN_SURNAMES
|
||||
IN_PLACES
|
||||
IN_GAMES
|
||||
IN_CELEBS
|
||||
IN_NAMES
|
||||
LENGTH
|
||||
CLUSTER
|
||||
POS_TYPE
|
||||
SENSE_TYPE
|
||||
|
||||
|
||||
cdef struct Lexeme:
|
||||
flag_t flags
|
||||
flags_t flags
|
||||
|
||||
id_t id
|
||||
id_t sic
|
||||
id_t norm
|
||||
id_t shape
|
||||
id_t asciied
|
||||
id_t prefix
|
||||
id_t suffix
|
||||
attr_t id
|
||||
attr_t sic
|
||||
attr_t norm
|
||||
attr_t shape
|
||||
attr_t asciied
|
||||
attr_t prefix
|
||||
attr_t suffix
|
||||
|
||||
attr_t length
|
||||
attr_t cluster
|
||||
attr_t pos_type
|
||||
attr_t sense_type
|
||||
|
||||
float prob
|
||||
|
||||
len_t length
|
||||
tag_t cluster
|
||||
tag_t postype
|
||||
tag_t supersense
|
||||
float upper_pc
|
||||
float title_pc
|
||||
|
||||
|
||||
cdef Lexeme EMPTY_LEXEME
|
||||
|
||||
cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
|
||||
StringStore store, dict props) except *
|
||||
|
||||
cpdef Lexeme init(id_t i, unicode string, hash_t hashed, StringStore store,
|
||||
dict props) except *
|
||||
|
||||
|
||||
cdef inline bint check_flag(Lexeme* lexeme, size_t flag_id) nogil:
|
||||
cdef inline bint check_flag(Lexeme* lexeme, attr_id_t flag_id) nogil:
    # Return whether bit `flag_id` is set in the lexeme's flags field.
    #
    # The mask is built from a flags_t-typed 1 rather than the literal `1`:
    # a bare literal is shifted as a C int, which overflows (undefined
    # behaviour) once flag_id reaches the width of int, so the upper flags
    # of the 64 reserved flag IDs could never be tested reliably.
    cdef flags_t one = 1
    return (lexeme.flags & (one << flag_id)) != 0
|
||||
|
||||
|
||||
cdef attr_t get_attr(Lexeme* lex, attr_id_t attr_id)
|
||||
|
|
|
@ -6,67 +6,59 @@ from libc.string cimport memset
|
|||
|
||||
import orth
|
||||
|
||||
from .utf8string cimport Utf8Str
|
||||
|
||||
OOV_DIST_FLAGS = 0
|
||||
|
||||
memset(&EMPTY_LEXEME, 0, sizeof(Lexeme))
|
||||
|
||||
|
||||
def get_flags(unicode string, float upper_pc, float title_pc, float lower_pc):
|
||||
cdef flag_t flags = 0
|
||||
flags |= orth.is_alpha(string) << IS_ALPHA
|
||||
flags |= orth.is_ascii(string) << IS_ASCII
|
||||
flags |= orth.is_digit(string) << IS_DIGIT
|
||||
flags |= orth.is_lower(string) << IS_LOWER
|
||||
flags |= orth.is_punct(string) << IS_PUNCT
|
||||
flags |= orth.is_space(string) << IS_SPACE
|
||||
flags |= orth.is_title(string) << IS_TITLE
|
||||
flags |= orth.is_upper(string) << IS_UPPER
|
||||
|
||||
flags |= orth.like_url(string) << LIKE_URL
|
||||
flags |= orth.like_number(string) << LIKE_NUMBER
|
||||
return flags
|
||||
|
||||
|
||||
cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
|
||||
StringStore store, dict props) except *:
|
||||
StringStore string_store, dict props) except *:
|
||||
cdef Lexeme lex
|
||||
lex.id = i
|
||||
lex.length = len(string)
|
||||
lex.sic = get_string_id(string, store)
|
||||
lex.sic = string_store[string]
|
||||
|
||||
lex.cluster = props.get('cluster', 0)
|
||||
lex.postype = props.get('postype', 0)
|
||||
lex.supersense = props.get('supersense', 0)
|
||||
lex.pos_type = props.get('pos_type', 0)
|
||||
lex.sense_type = props.get('sense_type', 0)
|
||||
lex.prob = props.get('prob', 0)
|
||||
|
||||
cdef float upper_pc = props.get('upper_pc', 0.0)
|
||||
cdef float lower_pc = props.get('lower_pc', 0.0)
|
||||
cdef float title_pc = props.get('title_pc', 0.0)
|
||||
lex.upper_pc = props.get('upper_pc', 0.0)
|
||||
lex.title_pc = props.get('lower_pc', 0.0)
|
||||
|
||||
lex.prefix = get_string_id(string[0], store)
|
||||
lex.suffix = get_string_id(string[-3:], store)
|
||||
if upper_pc or lower_pc or title_pc:
|
||||
canon_cased = orth.canon_case(string, upper_pc, title_pc, lower_pc)
|
||||
lex.norm = get_string_id(canon_cased, store)
|
||||
else:
|
||||
lex.norm = lex.sic
|
||||
lex.shape = get_string_id(orth.word_shape(string), store)
|
||||
lex.asciied = get_string_id(orth.asciied(string), store)
|
||||
lex.flags = get_flags(string, upper_pc, title_pc, lower_pc)
|
||||
|
||||
lex.flags |= props.get('in_males', 0) << IN_MALES
|
||||
lex.flags |= props.get('in_females', 0) << IN_FEMALES
|
||||
lex.flags |= props.get('in_surnames', 0) << IN_SURNAMES
|
||||
lex.flags |= props.get('in_places', 0) << IN_PLACES
|
||||
lex.flags |= props.get('in_celebs', 0) << IN_CELEBS
|
||||
lex.flags |= props.get('in_games', 0) << IN_GAMES
|
||||
lex.flags |= props.get('in_names', 0) << IN_NAMES
|
||||
lex.prefix = string_store[string[:1]]
|
||||
lex.suffix = string_store[string[-3:]]
|
||||
lex.norm = lex.sic # TODO
|
||||
lex.shape = string_store[orth.word_shape(string)]
|
||||
lex.asciied = string_store[orth.asciied(string)]
|
||||
|
||||
lex.flags = props.get('flags', 0)
|
||||
return lex
|
||||
|
||||
|
||||
cdef id_t get_string_id(unicode string, StringStore store) except 0:
|
||||
cdef bytes byte_string = string.encode('utf8')
|
||||
cdef Utf8Str* orig_str = store.intern(<char*>byte_string, len(byte_string))
|
||||
return orig_str.i
|
||||
cdef attr_t get_attr(Lexeme* lex, attr_id_t feat_name):
    """Return the value of a single attribute of a lexeme.

    Attribute IDs smaller than the number of bits in flags_t are treated
    as flag bit positions and answered with check_flag (0 or 1). The
    remaining IDs select the struct field of the same name.

    Args:
        lex: The lexeme to read from.
        feat_name: The attribute ID to look up.

    Returns:
        The attribute value.

    Raises:
        ValueError: If feat_name is not a known attribute ID.
    """
    # NOTE(review): the declaration of this function carries no `except`
    # clause; depending on the Cython version, the exception raised below
    # may be reported and swallowed instead of propagating -- confirm.
    if feat_name < (sizeof(flags_t) * 8):
        return check_flag(lex, feat_name)
    elif feat_name == ID:
        return lex.id
    elif feat_name == SIC:
        return lex.sic
    elif feat_name == NORM:
        return lex.norm
    elif feat_name == SHAPE:
        return lex.shape
    elif feat_name == ASCIIED:
        return lex.asciied
    elif feat_name == PREFIX:
        return lex.prefix
    elif feat_name == SUFFIX:
        return lex.suffix
    elif feat_name == LENGTH:
        return lex.length
    elif feat_name == CLUSTER:
        return lex.cluster
    elif feat_name == POS_TYPE:
        return lex.pos_type
    elif feat_name == SENSE_TYPE:
        return lex.sense_type
    else:
        # ValueError instead of StandardError: StandardError exists only
        # on Python 2, where ValueError is one of its subclasses, so
        # existing `except StandardError` handlers still match.
        raise ValueError('Feature ID: %d not found' % feat_name)
|
||||
|
|
Loading…
Reference in New Issue
Block a user