* Make flag-setting a language-specific thing

Matthew Honnibal 2014-12-03 11:04:00 +11:00
parent 71b009e323
commit b463a7eb86
6 changed files with 224 additions and 146 deletions

View File

@@ -1,6 +1,32 @@
from spacy.lang cimport Language
from spacy.tokens cimport Tokens
# Flags
cpdef enum FlagID:
IS_ALPHA
IS_ASCII
IS_DIGIT
IS_LOWER
IS_PUNCT
IS_SPACE
IS_TITLE
IS_UPPER
LIKE_URL
LIKE_NUMBER
OFT_LOWER
OFT_TITLE
OFT_UPPER
IN_MALES
IN_FEMALES
IN_SURNAMES
IN_PLACES
IN_GAMES
IN_CELEBS
IN_NAMES
cdef class English(Language):
pass
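
A note on the enum: a cpdef enum without explicit values numbers its members consecutively from zero, so IS_ALPHA is 0, IS_ASCII is 1, and so on through IN_NAMES. Each ID doubles as a bit position in the lexeme's 64-bit flags value. A pure-Python sketch of that correspondence (IntEnum and the explicit values are illustrative; only the first few names are shown):

from enum import IntEnum

class FlagID(IntEnum):
    # Mirrors the cpdef enum: members are numbered consecutively from 0.
    IS_ALPHA = 0
    IS_ASCII = 1
    IS_DIGIT = 2
    IS_LOWER = 3
    IS_PUNCT = 4
    IS_SPACE = 5
    IS_TITLE = 6
    IS_UPPER = 7
    LIKE_URL = 8
    LIKE_NUMBER = 9

# Each ID is a bit position in a single 64-bit flags value.
flags = (1 << FlagID.IS_ALPHA) | (1 << FlagID.IS_LOWER)
assert flags & (1 << FlagID.IS_LOWER)
assert not flags & (1 << FlagID.IS_UPPER)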

View File

@@ -38,6 +38,8 @@ provides a fully Penn Treebank 3-compliant tokenizer.
from __future__ import unicode_literals
cimport lang
from .typedefs cimport flags_t
import orth
cdef class English(Language):
@@ -47,7 +49,20 @@ cdef class English(Language):
name (unicode): The two letter code used by Wikipedia for the language.
lexicon (Lexicon): The lexicon. Exposes the lookup method.
"""
pass
def set_flags(self, unicode string):
cdef flags_t flags = 0
flags |= orth.is_alpha(string) << IS_ALPHA
flags |= orth.is_ascii(string) << IS_ASCII
flags |= orth.is_digit(string) << IS_DIGIT
flags |= orth.is_lower(string) << IS_LOWER
flags |= orth.is_punct(string) << IS_PUNCT
flags |= orth.is_space(string) << IS_SPACE
flags |= orth.is_title(string) << IS_TITLE
flags |= orth.is_upper(string) << IS_UPPER
flags |= orth.like_url(string) << LIKE_URL
flags |= orth.like_number(string) << LIKE_NUMBER
return flags
EN = English('en')
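
set_flags is the heart of the commit: each Language subclass decides which orthographic predicates populate which flag bits, and the Lexicon (below) calls back into it whenever it builds a new Lexeme. A hedged pure-Python sketch of the same packing pattern, with str methods standing in for the orth predicates (which are not part of this diff):

IS_ALPHA, IS_DIGIT, IS_LOWER, LIKE_URL = 0, 1, 2, 3

def set_flags(string):
    # Pack one boolean predicate per bit, as English.set_flags does.
    flags = 0
    flags |= int(string.isalpha()) << IS_ALPHA
    flags |= int(string.isdigit()) << IS_DIGIT
    flags |= int(string.islower()) << IS_LOWER
    # Crude stand-in for orth.like_url:
    flags |= int(string.startswith(('http://', 'www.'))) << LIKE_URL
    return flags

assert set_flags('hello') & (1 << IS_LOWER)
assert set_flags('www.example.com') & (1 << LIKE_URL)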

View File

@@ -8,23 +8,17 @@ from cymem.cymem cimport Pool
from .typedefs cimport hash_t
from .tokens cimport Tokens
from .lexeme cimport Lexeme
from .tagger cimport Tagger
from .utf8string cimport StringStore
cdef struct String:
Py_UNICODE* chars
size_t n
hash_t key
from .utf8string cimport StringStore, UniStr
cdef class Lexicon:
cpdef public set_flags
cdef Pool mem
cpdef readonly size_t size
cpdef readonly StringStore strings
cdef vector[Lexeme*] lexemes
cdef Lexeme* get(self, String* s) except NULL
cdef Lexeme* get(self, UniStr* s) except NULL
cdef PreshMap _map
@@ -43,10 +37,10 @@ cdef class Language:
cpdef Tokens tokens_from_list(self, list strings)
cpdef Tokens tokenize(self, unicode text)
cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1
cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes,
cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1
cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes,
vector[Lexeme*] *suffixes) except NULL
cdef int _attach_tokens(self, Tokens tokens, int idx, String* string,
cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
vector[Lexeme*] *prefixes, vector[Lexeme*] *suffixes) except -1
cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1
cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1

View File

@@ -19,6 +19,8 @@ from .lexeme cimport Lexeme
from .lexeme cimport EMPTY_LEXEME
from .lexeme cimport init as lexeme_init
from .utf8string cimport slice_unicode
from . import util
from .util import read_lang_data
from .tokens import Tokens
@@ -34,7 +36,7 @@ cdef class Language:
self._prefix_re = re.compile(prefix)
self._suffix_re = re.compile(suffix)
self._infix_re = re.compile(infix)
self.lexicon = Lexicon()
self.lexicon = Lexicon(self.set_flags)
if path.exists(path.join(util.DATA_DIR, name, 'lexemes')):
self.lexicon.load(path.join(util.DATA_DIR, name, 'lexemes'))
self.lexicon.strings.load(path.join(util.DATA_DIR, name, 'strings'))
@@ -45,11 +47,11 @@ cdef class Language:
cdef Tokens tokens = Tokens(self.lexicon.strings, length)
if length == 0:
return tokens
cdef String string_struct
cdef UniStr string_struct
cdef unicode py_string
cdef int idx = 0
for i, py_string in enumerate(strings):
string_from_unicode(&string_struct, py_string)
slice_unicode(&string_struct, py_string, 0, len(py_string))
tokens.push_back(idx, self.lexicon.get(&string_struct))
idx += len(py_string) + 1
return tokens
@@ -77,11 +79,11 @@ cdef class Language:
cdef int start = 0
cdef Py_UNICODE* chars = string
cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0])
cdef String span
cdef UniStr span
for i in range(1, length):
if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
if start < i:
string_slice(&span, chars, start, i)
slice_unicode(&span, chars, start, i)
lexemes = <Lexeme**>self._cache.get(span.key)
if lexemes != NULL:
tokens.extend(start, lexemes, 0)
@@ -93,7 +95,7 @@ cdef class Language:
start += 1
i += 1
if start < i:
string_slice(&span, chars, start, i)
slice_unicode(&span, chars, start, i)
lexemes = <Lexeme**>self._cache.get(span.key)
if lexemes != NULL:
tokens.extend(start, lexemes, 0)
@@ -101,7 +103,7 @@ cdef class Language:
self._tokenize(tokens, &span, start, i)
return tokens
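
The loop above is the first pass of tokenization: scan for whitespace boundaries, look each non-space span up in the cache by its hash, and only fall through to _tokenize on a miss. A simplified Python sketch of that control flow, with a dict for the cache and plain strings for lexemes (the real keys are 64-bit span hashes):

def tokenize(text, cache, split_span):
    # split_span: fallback that splits an uncached span into sub-tokens.
    tokens = []
    for span in text.split():              # whitespace pass
        hit = cache.get(span)
        if hit is not None:
            tokens.extend(hit)             # cached: reuse the token sequence
        else:
            subtokens = split_span(span)   # miss: run the affix machinery
            cache[span] = subtokens        # memoize for next time
            tokens.extend(subtokens)
    return tokens

print(tokenize("Don't stop", {}, lambda s: [s]))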
cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1:
cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1:
cdef vector[Lexeme*] prefixes
cdef vector[Lexeme*] suffixes
cdef hash_t orig_key
@@ -112,20 +114,20 @@ cdef class Language:
self._attach_tokens(tokens, start, span, &prefixes, &suffixes)
self._save_cached(&tokens.lex[orig_size], orig_key, tokens.length - orig_size)
cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes,
cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes,
vector[Lexeme*] *suffixes) except NULL:
cdef size_t i
cdef String prefix
cdef String suffix
cdef String minus_pre
cdef String minus_suf
cdef UniStr prefix
cdef UniStr suffix
cdef UniStr minus_pre
cdef UniStr minus_suf
cdef size_t last_size = 0
while string.n != 0 and string.n != last_size:
last_size = string.n
pre_len = self._find_prefix(string.chars, string.n)
if pre_len != 0:
string_slice(&prefix, string.chars, 0, pre_len)
string_slice(&minus_pre, string.chars, pre_len, string.n)
slice_unicode(&prefix, string.chars, 0, pre_len)
slice_unicode(&minus_pre, string.chars, pre_len, string.n)
# Check whether we've hit a special-case
if minus_pre.n >= 1 and self._specials.get(minus_pre.key) != NULL:
string[0] = minus_pre
@@ -133,15 +135,15 @@ cdef class Language:
break
suf_len = self._find_suffix(string.chars, string.n)
if suf_len != 0:
string_slice(&suffix, string.chars, string.n - suf_len, string.n)
string_slice(&minus_suf, string.chars, 0, string.n - suf_len)
slice_unicode(&suffix, string.chars, string.n - suf_len, string.n)
slice_unicode(&minus_suf, string.chars, 0, string.n - suf_len)
# Check whether we've hit a special-case
if minus_suf.n >= 1 and self._specials.get(minus_suf.key) != NULL:
string[0] = minus_suf
suffixes.push_back(self.lexicon.get(&suffix))
break
if pre_len and suf_len and (pre_len + suf_len) <= string.n:
string_slice(string, string.chars, pre_len, string.n - suf_len)
slice_unicode(string, string.chars, pre_len, string.n - suf_len)
prefixes.push_back(self.lexicon.get(&prefix))
suffixes.push_back(self.lexicon.get(&suffix))
elif pre_len:
@@ -155,13 +157,13 @@ cdef class Language:
return string
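
_split_affixes peels prefixes and suffixes off the span until nothing more matches or a special-case string is exposed, collecting the stripped pieces for _attach_tokens. A rough Python rendering under the same assumptions (find_prefix/find_suffix return the matched length or 0, and specials is a set of strings with dedicated rules):

def split_affixes(string, find_prefix, find_suffix, specials):
    prefixes, suffixes = [], []
    last_size = -1
    while string and len(string) != last_size:
        last_size = len(string)
        pre_len = find_prefix(string)
        if pre_len:
            prefix, minus_pre = string[:pre_len], string[pre_len:]
            if minus_pre in specials:       # special case: stop splitting
                prefixes.append(prefix)
                return prefixes, minus_pre, suffixes
        suf_len = find_suffix(string)
        if suf_len:
            suffix = string[len(string) - suf_len:]
            minus_suf = string[:len(string) - suf_len]
            if minus_suf in specials:
                suffixes.append(suffix)
                return prefixes, minus_suf, suffixes
        if pre_len and suf_len and pre_len + suf_len <= len(string):
            string = string[pre_len:len(string) - suf_len]
            prefixes.append(prefix)
            suffixes.append(suffix)
        elif pre_len:
            string = minus_pre
            prefixes.append(prefix)
        elif suf_len:
            string = minus_suf
            suffixes.append(suffix)
    return prefixes, string, suffixes

find_prefix = lambda s: 1 if s[:1] in '("' else 0
find_suffix = lambda s: 1 if s[-1:] in '.,)?!"' else 0
print(split_affixes('"(hello)."', find_prefix, find_suffix, set()))
# (['"', '('], 'hello', ['"', '.', ')'])

Note that suffixes come out outermost-first; the real code attaches them with a reverse iterator (rbegin/rend) to restore surface order.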
cdef int _attach_tokens(self, Tokens tokens,
int idx, String* string,
int idx, UniStr* string,
vector[Lexeme*] *prefixes,
vector[Lexeme*] *suffixes) except -1:
cdef int split
cdef Lexeme** lexemes
cdef Lexeme* lexeme
cdef String span
cdef UniStr span
if prefixes.size():
idx = tokens.extend(idx, prefixes.data(), prefixes.size())
if string.n != 0:
@@ -174,11 +176,11 @@ cdef class Language:
if split == 0 or split == -1:
idx = tokens.push_back(idx, self.lexicon.get(string))
else:
string_slice(&span, string.chars, 0, split)
slice_unicode(&span, string.chars, 0, split)
idx = tokens.push_back(idx, self.lexicon.get(&span))
string_slice(&span, string.chars, split, split+1)
slice_unicode(&span, string.chars, split, split+1)
idx = tokens.push_back(idx, self.lexicon.get(&span))
string_slice(&span, string.chars, split + 1, string.n)
slice_unicode(&span, string.chars, split + 1, string.n)
idx = tokens.push_back(idx, self.lexicon.get(&span))
cdef vector[Lexeme*].reverse_iterator it = suffixes.rbegin()
while it != suffixes.rend():
@@ -222,14 +224,14 @@ cdef class Language:
'''
cdef Lexeme** lexemes
cdef hash_t hashed
cdef String string
cdef UniStr string
for uni_string, substrings in token_rules:
lexemes = <Lexeme**>self.mem.alloc(len(substrings) + 1, sizeof(Lexeme*))
for i, substring in enumerate(substrings):
string_from_unicode(&string, substring)
slice_unicode(&string, substring, 0, len(substring))
lexemes[i] = <Lexeme*>self.lexicon.get(&string)
lexemes[i + 1] = NULL
string_from_unicode(&string, uni_string)
slice_unicode(&string, uni_string, 0, len(uni_string))
self._specials.set(string.key, lexemes)
self._cache.set(string.key, lexemes)
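
This is how special cases (contractions and the like) get their pre-computed analyses: each rule's token sequence is built once, then stored in both the specials table and the cache under the string's hash. In Python terms, with plain strings for lexemes:

def load_specials(token_rules):
    specials, cache = {}, {}
    for string, substrings in token_rules:
        lexemes = list(substrings)    # the real code interns Lexeme* here
        specials[string] = lexemes
        cache[string] = lexemes       # the cache shares the same entry
    return specials, cache

specials, cache = load_specials([("don't", ["do", "n't"])])
assert cache["don't"] == ["do", "n't"]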
@@ -239,21 +241,23 @@ cdef class Lexicon:
Also interns UTF-8 strings, and maps them to consecutive integer IDs.
'''
def __init__(self):
def __init__(self, object set_flags=None):
self.mem = Pool()
self._map = PreshMap(2 ** 20)
self.strings = StringStore()
self.lexemes.push_back(&EMPTY_LEXEME)
self.size = 1
self.set_flags = set_flags
cdef Lexeme* get(self, String* string) except NULL:
cdef Lexeme* get(self, UniStr* string) except NULL:
'''Retrieve a pointer to a Lexeme from the lexicon.'''
cdef Lexeme* lex
lex = <Lexeme*>self._map.get(string.key)
if lex != NULL:
return lex
lex = <Lexeme*>self.mem.alloc(sizeof(Lexeme), 1)
lex[0] = lexeme_init(self.size, string.chars[:string.n], string.key, self.strings, {})
lex[0] = lexeme_init(self.size, string.chars[:string.n], string.key,
self.strings, {'flags': self.set_flags(string.chars[:string.n])})
self._map.set(string.key, lex)
while self.lexemes.size() < (lex.id + 1):
self.lexemes.push_back(&EMPTY_LEXEME)
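
Lexicon.get is a memoized constructor: look the span's hash up in the map, and only on a miss allocate and initialize a fresh Lexeme, now passing in the flags computed by the language-supplied set_flags callback. A dict-based sketch of the pattern:

class Lexicon:
    def __init__(self, set_flags=None):
        self._map = {}                    # PreshMap in the real code
        self.set_flags = set_flags or (lambda s: 0)
        self.size = 1                     # slot 0 is the empty lexeme

    def get(self, string):
        lex = self._map.get(string)
        if lex is not None:
            return lex                    # hit: one lexeme per string, ever
        lex = {'id': self.size, 'flags': self.set_flags(string)}
        self._map[string] = lex
        self.size += 1
        return lex

lexicon = Lexicon(set_flags=lambda s: int(s.isalpha()))
assert lexicon.get('dog') is lexicon.get('dog')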
@@ -283,14 +287,14 @@ cdef class Lexicon:
'''
if type(id_or_string) == int:
return self.lexemes.at(id_or_string)[0]
cdef String string
string_from_unicode(&string, id_or_string)
cdef UniStr string
slice_unicode(&string, id_or_string, 0, len(id_or_string))
cdef Lexeme* lexeme = self.get(&string)
return lexeme[0]
def __setitem__(self, unicode uni_string, dict props):
cdef String s
string_from_unicode(&s, uni_string)
cdef UniStr s
slice_unicode(&s, uni_string, 0, len(uni_string))
cdef Lexeme* lex = self.get(&s)
lex[0] = lexeme_init(lex.id, s.chars[:s.n], s.key, self.strings, props)
@@ -338,14 +342,3 @@ cdef class Lexicon:
i += 1
self.size += 1
fclose(fp)
cdef void string_from_unicode(String* s, unicode uni):
cdef Py_UNICODE* c_uni = <Py_UNICODE*>uni
string_slice(s, c_uni, 0, len(uni))
cdef inline void string_slice(String* s, Py_UNICODE* chars, int start, int end) nogil:
s.chars = &chars[start]
s.n = end - start
s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
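
These two deleted helpers are what slice_unicode (cimported from utf8string) replaces: record a span's start pointer, its length, and a 64-bit hash of its characters, so spans can be cached and compared without copying. A Python approximation of the triple it fills in (hashlib stands in for hash64, which hashes the raw Py_UNICODE bytes in place):

import hashlib

def slice_unicode(chars, start, end):
    # Returns the (chars, n, key) triple that the UniStr struct carries.
    span = chars[start:end]
    digest = hashlib.blake2b(span.encode('utf-8'), digest_size=8).digest()
    return {'chars': span, 'n': end - start,
            'key': int.from_bytes(digest, 'little')}

s = slice_unicode('tokenizer', 0, 5)
assert s['chars'] == 'token' and s['n'] == 5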

View File

@@ -1,61 +1,119 @@
from .typedefs cimport hash_t, utf8_t, flag_t, id_t, len_t, tag_t
from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t
from .utf8string cimport StringStore
from libc.stdint cimport uint16_t
cpdef flag_t OOV_DIST_FLAGS
# Flags
cpdef enum:
IS_ALPHA
IS_ASCII
IS_DIGIT
IS_LOWER
IS_PUNCT
IS_SPACE
IS_TITLE
IS_UPPER
# Reserve 64 values for flag features
cpdef enum attr_id_t:
FLAG0
FLAG1
FLAG2
FLAG3
FLAG4
FLAG5
FLAG6
FLAG7
FLAG8
FLAG9
FLAG10
FLAG11
FLAG12
FLAG13
FLAG14
FLAG15
FLAG16
FLAG17
FLAG18
FLAG19
FLAG20
FLAG21
FLAG22
FLAG23
FLAG24
FLAG25
FLAG26
FLAG27
FLAG28
FLAG29
FLAG30
FLAG31
FLAG32
FLAG33
FLAG34
FLAG35
FLAG36
FLAG37
FLAG38
FLAG39
FLAG40
FLAG41
FLAG42
FLAG43
FLAG44
FLAG45
FLAG46
FLAG47
FLAG48
FLAG49
FLAG50
FLAG51
FLAG52
FLAG53
FLAG54
FLAG55
FLAG56
FLAG57
FLAG58
FLAG59
FLAG60
FLAG61
FLAG62
FLAG63
LIKE_URL
LIKE_NUMBER
ID
SIC
NORM
SHAPE
ASCIIED
PREFIX
SUFFIX
OFT_LOWER
OFT_TITLE
OFT_UPPER
IN_MALES
IN_FEMALES
IN_SURNAMES
IN_PLACES
IN_GAMES
IN_CELEBS
IN_NAMES
LENGTH
CLUSTER
POS_TYPE
SENSE_TYPE
cdef struct Lexeme:
flag_t flags
flags_t flags
id_t id
id_t sic
id_t norm
id_t shape
id_t asciied
id_t prefix
id_t suffix
attr_t id
attr_t sic
attr_t norm
attr_t shape
attr_t asciied
attr_t prefix
attr_t suffix
attr_t length
attr_t cluster
attr_t pos_type
attr_t sense_type
float prob
len_t length
tag_t cluster
tag_t postype
tag_t supersense
float upper_pc
float title_pc
cdef Lexeme EMPTY_LEXEME
cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
StringStore store, dict props) except *
cpdef Lexeme init(id_t i, unicode string, hash_t hashed, StringStore store,
dict props) except *
cdef inline bint check_flag(Lexeme* lexeme, size_t flag_id) nogil:
cdef inline bint check_flag(Lexeme* lexeme, attr_id_t flag_id) nogil:
return lexeme.flags & (1 << flag_id)
cdef attr_t get_attr(Lexeme* lex, attr_id_t attr_id)
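
Reserving FLAG0-FLAG63 at the front of attr_id_t is what keeps the attribute interface uniform: by construction, any attribute ID below 64 is a bit position in the flags_t field, so check_flag can answer it with one shift and mask, and callers never need to know whether a feature is a packed boolean or a stored integer. A small sketch of the convention (the non-flag values are illustrative):

NUM_FLAGS = 64       # FLAG0..FLAG63 occupy values 0-63, one per bit
ID, LENGTH = 66, 80  # illustrative: the real enum continues past the flags

def is_flag(attr_id):
    return attr_id < NUM_FLAGS

assert is_flag(13)           # FLAG13 is a bit test on lex.flags
assert not is_flag(LENGTH)   # LENGTH reads its own struct field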

View File

@@ -6,67 +6,59 @@ from libc.string cimport memset
import orth
from .utf8string cimport Utf8Str
OOV_DIST_FLAGS = 0
memset(&EMPTY_LEXEME, 0, sizeof(Lexeme))
def get_flags(unicode string, float upper_pc, float title_pc, float lower_pc):
cdef flag_t flags = 0
flags |= orth.is_alpha(string) << IS_ALPHA
flags |= orth.is_ascii(string) << IS_ASCII
flags |= orth.is_digit(string) << IS_DIGIT
flags |= orth.is_lower(string) << IS_LOWER
flags |= orth.is_punct(string) << IS_PUNCT
flags |= orth.is_space(string) << IS_SPACE
flags |= orth.is_title(string) << IS_TITLE
flags |= orth.is_upper(string) << IS_UPPER
flags |= orth.like_url(string) << LIKE_URL
flags |= orth.like_number(string) << LIKE_NUMBER
return flags
cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
StringStore store, dict props) except *:
StringStore string_store, dict props) except *:
cdef Lexeme lex
lex.id = i
lex.length = len(string)
lex.sic = get_string_id(string, store)
lex.sic = string_store[string]
lex.cluster = props.get('cluster', 0)
lex.postype = props.get('postype', 0)
lex.supersense = props.get('supersense', 0)
lex.pos_type = props.get('pos_type', 0)
lex.sense_type = props.get('sense_type', 0)
lex.prob = props.get('prob', 0)
cdef float upper_pc = props.get('upper_pc', 0.0)
cdef float lower_pc = props.get('lower_pc', 0.0)
cdef float title_pc = props.get('title_pc', 0.0)
lex.upper_pc = props.get('upper_pc', 0.0)
lex.title_pc = props.get('lower_pc', 0.0)
lex.prefix = get_string_id(string[0], store)
lex.suffix = get_string_id(string[-3:], store)
if upper_pc or lower_pc or title_pc:
canon_cased = orth.canon_case(string, upper_pc, title_pc, lower_pc)
lex.norm = get_string_id(canon_cased, store)
else:
lex.norm = lex.sic
lex.shape = get_string_id(orth.word_shape(string), store)
lex.asciied = get_string_id(orth.asciied(string), store)
lex.flags = get_flags(string, upper_pc, title_pc, lower_pc)
lex.flags |= props.get('in_males', 0) << IN_MALES
lex.flags |= props.get('in_females', 0) << IN_FEMALES
lex.flags |= props.get('in_surnames', 0) << IN_SURNAMES
lex.flags |= props.get('in_places', 0) << IN_PLACES
lex.flags |= props.get('in_celebs', 0) << IN_CELEBS
lex.flags |= props.get('in_games', 0) << IN_GAMES
lex.flags |= props.get('in_names', 0) << IN_NAMES
lex.prefix = string_store[string[:1]]
lex.suffix = string_store[string[-3:]]
lex.norm = lex.sic # TODO
lex.shape = string_store[orth.word_shape(string)]
lex.asciied = string_store[orth.asciied(string)]
lex.flags = props.get('flags', 0)
return lex
cdef id_t get_string_id(unicode string, StringStore store) except 0:
cdef bytes byte_string = string.encode('utf8')
cdef Utf8Str* orig_str = store.intern(<char*>byte_string, len(byte_string))
return orig_str.i
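
The deleted get_string_id helper is subsumed by StringStore indexing: string_store[string] interns the string and hands back its integer ID, which is what init now uses for sic, prefix, suffix, shape, and asciied. A minimal Python model of that interning behavior (starting IDs at 1 to keep 0 as a sentinel is an assumption, mirroring the EMPTY_LEXEME slot):

class StringStore:
    # Minimal model: interns strings, handing out consecutive integer IDs.
    def __init__(self):
        self._ids = {}

    def __getitem__(self, string):
        if string not in self._ids:
            self._ids[string] = len(self._ids) + 1
        return self._ids[string]

store = StringStore()
assert store['dog'] == store['dog']    # same string, same ID
assert store['dog'] != store['cat']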
cdef attr_t get_attr(Lexeme* lex, attr_id_t feat_name):
if feat_name < (sizeof(flags_t) * 8):
return check_flag(lex, feat_name)
elif feat_name == ID:
return lex.id
elif feat_name == SIC:
return lex.sic
elif feat_name == NORM:
return lex.norm
elif feat_name == SHAPE:
return lex.shape
elif feat_name == ASCIIED:
return lex.asciied
elif feat_name == PREFIX:
return lex.prefix
elif feat_name == SUFFIX:
return lex.suffix
elif feat_name == LENGTH:
return lex.length
elif feat_name == CLUSTER:
return lex.cluster
elif feat_name == POS_TYPE:
return lex.pos_type
elif feat_name == SENSE_TYPE:
return lex.sense_type
else:
raise StandardError('Feature ID: %d not found' % feat_name)
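
get_attr is the dispatcher that the reserved flag range makes possible: any feature ID below sizeof(flags_t) * 8 = 64 is answered by check_flag's bit test, and everything else reads a struct field. A condensed Python sketch, with the lexeme as a dict and only a few attributes mapped (the IDs are illustrative):

NUM_FLAG_BITS = 64            # sizeof(flags_t) * 8 in the Cython version
ID, SIC, LENGTH = 66, 67, 80  # illustrative IDs beyond the flag range

def get_attr(lex, attr_id):
    if attr_id < NUM_FLAG_BITS:              # flag: answer with a bit test
        return (lex['flags'] >> attr_id) & 1
    fields = {ID: 'id', SIC: 'sic', LENGTH: 'length'}
    if attr_id not in fields:
        raise ValueError('Feature ID: %d not found' % attr_id)
    return lex[fields[attr_id]]              # attribute: read a struct field

lex = {'flags': 1 << 3, 'id': 7, 'sic': 42, 'length': 5}
assert get_attr(lex, 3) == 1                 # FLAG3 is set
assert get_attr(lex, LENGTH) == 5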