diff --git a/spacy/en.pxd b/spacy/en.pxd
index a7c643eba..cccfb60a8 100644
--- a/spacy/en.pxd
+++ b/spacy/en.pxd
@@ -1,6 +1,32 @@
 from spacy.lang cimport Language
 from spacy.tokens cimport Tokens
 
+# Flags
+cpdef enum FlagID:
+    IS_ALPHA
+    IS_ASCII
+    IS_DIGIT
+    IS_LOWER
+    IS_PUNCT
+    IS_SPACE
+    IS_TITLE
+    IS_UPPER
+
+    LIKE_URL
+    LIKE_NUMBER
+
+    OFT_LOWER
+    OFT_TITLE
+    OFT_UPPER
+
+    IN_MALES
+    IN_FEMALES
+    IN_SURNAMES
+    IN_PLACES
+    IN_GAMES
+    IN_CELEBS
+    IN_NAMES
+
 
 cdef class English(Language):
     pass
diff --git a/spacy/en.pyx b/spacy/en.pyx
index 95c1cbd94..92be97aad 100644
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@@ -38,6 +38,8 @@ provides a fully Penn Treebank 3-compliant tokenizer.
 from __future__ import unicode_literals
 
 cimport lang
+from .typedefs cimport flags_t
+import orth
 
 
 cdef class English(Language):
@@ -47,7 +49,20 @@ cdef class English(Language):
     name (unicode): The two letter code used by Wikipedia for the language.
     lexicon (Lexicon): The lexicon. Exposes the lookup method.
     """
-    pass
+    def set_flags(self, unicode string):
+        cdef flags_t flags = 0
+        flags |= orth.is_alpha(string) << IS_ALPHA
+        flags |= orth.is_ascii(string) << IS_ASCII
+        flags |= orth.is_digit(string) << IS_DIGIT
+        flags |= orth.is_lower(string) << IS_LOWER
+        flags |= orth.is_punct(string) << IS_PUNCT
+        flags |= orth.is_space(string) << IS_SPACE
+        flags |= orth.is_title(string) << IS_TITLE
+        flags |= orth.is_upper(string) << IS_UPPER
+
+        flags |= orth.like_url(string) << LIKE_URL
+        flags |= orth.like_number(string) << LIKE_NUMBER
+        return flags
 
 
 EN = English('en')
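
Note: English.set_flags packs one boolean predicate per bit of a 64-bit integer, using the FlagID enum values (IS_ALPHA = 0, IS_ASCII = 1, ...) as bit positions, so a word's whole orthographic profile travels as a single machine word and any flag reads back with one shift-and-mask. A minimal pure-Python sketch of the same packing; the str methods below are stand-ins for the spacy.orth predicates, which are not shown in this patch:

    # Pure-Python sketch of the bit-packing in English.set_flags.
    IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_TITLE, IS_UPPER = range(6)

    def set_flags(string):
        flags = 0
        flags |= int(string.isalpha()) << IS_ALPHA
        flags |= int(all(ord(c) < 128 for c in string)) << IS_ASCII
        flags |= int(string.isdigit()) << IS_DIGIT
        flags |= int(string.islower()) << IS_LOWER
        flags |= int(string.istitle()) << IS_TITLE
        flags |= int(string.isupper()) << IS_UPPER
        return flags

    def check_flag(flags, flag_id):
        # The same single-bit test as check_flag in lexeme.pxd.
        return bool(flags & (1 << flag_id))

    assert check_flag(set_flags(u'Apple'), IS_TITLE)
    assert not check_flag(set_flags(u'1999'), IS_ALPHA)
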
diff --git a/spacy/lang.pxd b/spacy/lang.pxd
index dc3262771..9e4bc7b5d 100644
--- a/spacy/lang.pxd
+++ b/spacy/lang.pxd
@@ -8,23 +8,17 @@ from cymem.cymem cimport Pool
 from .typedefs cimport hash_t
 from .tokens cimport Tokens
 from .lexeme cimport Lexeme
-from .tagger cimport Tagger
-from .utf8string cimport StringStore
-
-
-cdef struct String:
-    Py_UNICODE* chars
-    size_t n
-    hash_t key
+from .utf8string cimport StringStore, UniStr
 
 
 cdef class Lexicon:
+    cpdef public set_flags
     cdef Pool mem
     cpdef readonly size_t size
     cpdef readonly StringStore strings
     cdef vector[Lexeme*] lexemes
 
-    cdef Lexeme* get(self, String* s) except NULL
+    cdef Lexeme* get(self, UniStr* s) except NULL
 
     cdef PreshMap _map
 
@@ -43,10 +37,10 @@ cdef class Language:
     cpdef Tokens tokens_from_list(self, list strings)
     cpdef Tokens tokenize(self, unicode text)
 
-    cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1
-    cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes,
+    cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1
+    cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes,
                                 vector[Lexeme*] *suffixes) except NULL
-    cdef int _attach_tokens(self, Tokens tokens, int idx, String* string,
+    cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
                             vector[Lexeme*] *prefixes, vector[Lexeme*] *suffixes) except -1
     cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1
    cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1
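
Note: the String struct deleted here, a character pointer plus a length and a 64-bit hash key, is not dropped but moved into utf8string under the name UniStr, so Language and Lexicon share one definition. A rough Python analogue of what slice_unicode fills in for a span; Python's built-in hash stands in for the MurmurHash-based hash64 used by the real helper:

    # Rough Python analogue of the UniStr struct filled by slice_unicode:
    # a view on the parent string plus a precomputed key for cache lookups.
    class UniStr(object):
        def __init__(self, chars, start, end):
            self.chars = chars[start:end]  # the C struct keeps a pointer, not a copy
            self.n = end - start
            self.key = hash(self.chars)    # hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)

    span = UniStr(u"isn't", 0, 3)
    assert (span.chars, span.n) == (u'isn', 3)
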
diff --git a/spacy/lang.pyx b/spacy/lang.pyx
index df9cf3166..2a284b9df 100644
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -19,6 +19,8 @@ from .lexeme cimport Lexeme
 from .lexeme cimport EMPTY_LEXEME
 from .lexeme cimport init as lexeme_init
 
+from .utf8string cimport slice_unicode
+
 from . import util
 from .util import read_lang_data
 from .tokens import Tokens
@@ -34,7 +36,7 @@ cdef class Language:
         self._prefix_re = re.compile(prefix)
         self._suffix_re = re.compile(suffix)
         self._infix_re = re.compile(infix)
-        self.lexicon = Lexicon()
+        self.lexicon = Lexicon(self.set_flags)
         if path.exists(path.join(util.DATA_DIR, name, 'lexemes')):
             self.lexicon.load(path.join(util.DATA_DIR, name, 'lexemes'))
             self.lexicon.strings.load(path.join(util.DATA_DIR, name, 'strings'))
@@ -45,11 +47,11 @@ cdef class Language:
         cdef Tokens tokens = Tokens(self.lexicon.strings, length)
         if length == 0:
             return tokens
-        cdef String string_struct
+        cdef UniStr string_struct
         cdef unicode py_string
         cdef int idx = 0
         for i, py_string in enumerate(strings):
-            string_from_unicode(&string_struct, py_string)
+            slice_unicode(&string_struct, py_string, 0, len(py_string))
             tokens.push_back(idx, self.lexicon.get(&string_struct))
             idx += len(py_string) + 1
         return tokens
@@ -77,11 +79,11 @@ cdef class Language:
         cdef int start = 0
         cdef Py_UNICODE* chars = string
         cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0])
-        cdef String span
+        cdef UniStr span
         for i in range(1, length):
             if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
                 if start < i:
-                    string_slice(&span, chars, start, i)
+                    slice_unicode(&span, chars, start, i)
                     lexemes = self._cache.get(span.key)
                     if lexemes != NULL:
                         tokens.extend(start, lexemes, 0)
@@ -93,7 +95,7 @@ cdef class Language:
                     start += 1
         i += 1
         if start < i:
-            string_slice(&span, chars, start, i)
+            slice_unicode(&span, chars, start, i)
             lexemes = self._cache.get(span.key)
             if lexemes != NULL:
                 tokens.extend(start, lexemes, 0)
@@ -101,7 +103,7 @@
                 self._tokenize(tokens, &span, start, i)
         return tokens
 
-    cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1:
+    cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1:
         cdef vector[Lexeme*] prefixes
         cdef vector[Lexeme*] suffixes
         cdef hash_t orig_key
@@ -112,20 +114,20 @@
         self._attach_tokens(tokens, start, span, &prefixes, &suffixes)
         self._save_cached(&tokens.lex[orig_size], orig_key, tokens.length - orig_size)
 
-    cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes,
+    cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes,
                                 vector[Lexeme*] *suffixes) except NULL:
         cdef size_t i
-        cdef String prefix
-        cdef String suffix
-        cdef String minus_pre
-        cdef String minus_suf
+        cdef UniStr prefix
+        cdef UniStr suffix
+        cdef UniStr minus_pre
+        cdef UniStr minus_suf
         cdef size_t last_size = 0
         while string.n != 0 and string.n != last_size:
             last_size = string.n
             pre_len = self._find_prefix(string.chars, string.n)
             if pre_len != 0:
-                string_slice(&prefix, string.chars, 0, pre_len)
-                string_slice(&minus_pre, string.chars, pre_len, string.n)
+                slice_unicode(&prefix, string.chars, 0, pre_len)
+                slice_unicode(&minus_pre, string.chars, pre_len, string.n)
                 # Check whether we've hit a special-case
                 if minus_pre.n >= 1 and self._specials.get(minus_pre.key) != NULL:
                     string[0] = minus_pre
@@ -133,15 +135,15 @@
                     break
             suf_len = self._find_suffix(string.chars, string.n)
             if suf_len != 0:
-                string_slice(&suffix, string.chars, string.n - suf_len, string.n)
-                string_slice(&minus_suf, string.chars, 0, string.n - suf_len)
+                slice_unicode(&suffix, string.chars, string.n - suf_len, string.n)
+                slice_unicode(&minus_suf, string.chars, 0, string.n - suf_len)
                 # Check whether we've hit a special-case
                 if minus_suf.n >= 1 and self._specials.get(minus_suf.key) != NULL:
                     string[0] = minus_suf
                     suffixes.push_back(self.lexicon.get(&suffix))
                     break
             if pre_len and suf_len and (pre_len + suf_len) <= string.n:
-                string_slice(string, string.chars, pre_len, string.n - suf_len)
+                slice_unicode(string, string.chars, pre_len, string.n - suf_len)
                 prefixes.push_back(self.lexicon.get(&prefix))
                 suffixes.push_back(self.lexicon.get(&suffix))
             elif pre_len:
@@ -155,13 +157,13 @@
         return string
 
     cdef int _attach_tokens(self, Tokens tokens,
-                            int idx, String* string,
+                            int idx, UniStr* string,
                             vector[Lexeme*] *prefixes,
                             vector[Lexeme*] *suffixes) except -1:
         cdef int split
         cdef Lexeme** lexemes
         cdef Lexeme* lexeme
-        cdef String span
+        cdef UniStr span
         if prefixes.size():
             idx = tokens.extend(idx, prefixes.data(), prefixes.size())
         if string.n != 0:
@@ -174,11 +176,11 @@
             if split == 0 or split == -1:
                 idx = tokens.push_back(idx, self.lexicon.get(string))
             else:
-                string_slice(&span, string.chars, 0, split)
+                slice_unicode(&span, string.chars, 0, split)
                 idx = tokens.push_back(idx, self.lexicon.get(&span))
-                string_slice(&span, string.chars, split, split+1)
+                slice_unicode(&span, string.chars, split, split+1)
                 idx = tokens.push_back(idx, self.lexicon.get(&span))
-                string_slice(&span, string.chars, split + 1, string.n)
+                slice_unicode(&span, string.chars, split + 1, string.n)
                 idx = tokens.push_back(idx, self.lexicon.get(&span))
         cdef vector[Lexeme*].reverse_iterator it = suffixes.rbegin()
         while it != suffixes.rend():
@@ -222,14 +224,14 @@
         '''
         cdef Lexeme** lexemes
         cdef hash_t hashed
-        cdef String string
+        cdef UniStr string
         for uni_string, substrings in token_rules:
             lexemes = self.mem.alloc(len(substrings) + 1, sizeof(Lexeme*))
             for i, substring in enumerate(substrings):
-                string_from_unicode(&string, substring)
+                slice_unicode(&string, substring, 0, len(substring))
                 lexemes[i] = self.lexicon.get(&string)
                 lexemes[i + 1] = NULL
-            string_from_unicode(&string, uni_string)
+            slice_unicode(&string, uni_string, 0, len(uni_string))
             self._specials.set(string.key, lexemes)
             self._cache.set(string.key, lexemes)
@@ -239,21 +241,23 @@ cdef class Lexicon:
     Also interns UTF-8 strings, and maps them to consecutive integer IDs.
     '''
-    def __init__(self):
+    def __init__(self, object set_flags=None):
         self.mem = Pool()
         self._map = PreshMap(2 ** 20)
         self.strings = StringStore()
         self.lexemes.push_back(&EMPTY_LEXEME)
         self.size = 1
+        self.set_flags = set_flags
 
-    cdef Lexeme* get(self, String* string) except NULL:
+    cdef Lexeme* get(self, UniStr* string) except NULL:
         '''Retrieve a pointer to a Lexeme from the lexicon.'''
         cdef Lexeme* lex
         lex = self._map.get(string.key)
         if lex != NULL:
             return lex
         lex = self.mem.alloc(sizeof(Lexeme), 1)
-        lex[0] = lexeme_init(self.size, string.chars[:string.n], string.key, self.strings, {})
+        lex[0] = lexeme_init(self.size, string.chars[:string.n], string.key,
+                             self.strings, {'flags': self.set_flags(string.chars[:string.n])})
         self._map.set(string.key, lex)
         while self.lexemes.size() < (lex.id + 1):
             self.lexemes.push_back(&EMPTY_LEXEME)
@@ -283,14 +287,14 @@
         '''
         if type(id_or_string) == int:
             return self.lexemes.at(id_or_string)[0]
-        cdef String string
-        string_from_unicode(&string, id_or_string)
+        cdef UniStr string
+        slice_unicode(&string, id_or_string, 0, len(id_or_string))
         cdef Lexeme* lexeme = self.get(&string)
         return lexeme[0]
 
     def __setitem__(self, unicode uni_string, dict props):
-        cdef String s
-        string_from_unicode(&s, uni_string)
+        cdef UniStr s
+        slice_unicode(&s, uni_string, 0, len(uni_string))
         cdef Lexeme* lex = self.get(&s)
         lex[0] = lexeme_init(lex.id, s.chars[:s.n], s.key, self.strings, props)
@@ -338,14 +342,3 @@
             i += 1
             self.size += 1
         fclose(fp)
-
-
-cdef void string_from_unicode(String* s, unicode uni):
-    cdef Py_UNICODE* c_uni = uni
-    string_slice(s, c_uni, 0, len(uni))
-
-
-cdef inline void string_slice(String* s, Py_UNICODE* chars, int start, int end) nogil:
-    s.chars = &chars[start]
-    s.n = end - start
-    s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
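
Note: the functional change in this file is one line, Lexicon(self.set_flags): the language-specific flag builder now travels with the lexicon, and Lexicon.get invokes it only on a cache miss, so the orthographic predicates run once per distinct word type rather than once per token. A toy dict-backed model of that once-per-type behaviour; the miss counter is demo instrumentation, not part of the real class:

    # Toy model of Lexicon.get: flags are computed on the first sighting
    # of a string, then served from the hash-keyed cache ever after.
    class Lexicon(object):
        def __init__(self, set_flags=None):
            self._map = {}
            self.set_flags = set_flags
            self.misses = 0    # demo instrumentation only

        def get(self, string):
            lex = self._map.get(string)
            if lex is None:
                self.misses += 1
                flags = self.set_flags(string) if self.set_flags else 0
                lex = {'id': len(self._map) + 1, 'flags': flags}
                self._map[string] = lex
            return lex

    lexicon = Lexicon(set_flags=lambda s: int(s.isalpha()))
    for word in (u'the', u'the', u'cat'):
        lexicon.get(word)
    assert lexicon.misses == 2    # 'the' was only analysed once
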
diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd
index 0d7d206e5..9d5dddd6d 100644
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -1,61 +1,119 @@
-from .typedefs cimport hash_t, utf8_t, flag_t, id_t, len_t, tag_t
+from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t
 from .utf8string cimport StringStore
-from libc.stdint cimport uint16_t
 
-cpdef flag_t OOV_DIST_FLAGS
 
-# Flags
-cpdef enum:
-    IS_ALPHA
-    IS_ASCII
-    IS_DIGIT
-    IS_LOWER
-    IS_PUNCT
-    IS_SPACE
-    IS_TITLE
-    IS_UPPER
+# Reserve 64 values for flag features
+cpdef enum attr_id_t:
+    FLAG0
+    FLAG1
+    FLAG2
+    FLAG3
+    FLAG4
+    FLAG5
+    FLAG6
+    FLAG7
+    FLAG8
+    FLAG9
+    FLAG10
+    FLAG11
+    FLAG12
+    FLAG13
+    FLAG14
+    FLAG15
+    FLAG16
+    FLAG17
+    FLAG18
+    FLAG19
+    FLAG20
+    FLAG21
+    FLAG22
+    FLAG23
+    FLAG24
+    FLAG25
+    FLAG26
+    FLAG27
+    FLAG28
+    FLAG29
+    FLAG30
+    FLAG31
+    FLAG32
+    FLAG33
+    FLAG34
+    FLAG35
+    FLAG36
+    FLAG37
+    FLAG38
+    FLAG39
+    FLAG40
+    FLAG41
+    FLAG42
+    FLAG43
+    FLAG44
+    FLAG45
+    FLAG46
+    FLAG47
+    FLAG48
+    FLAG49
+    FLAG50
+    FLAG51
+    FLAG52
+    FLAG53
+    FLAG54
+    FLAG55
+    FLAG56
+    FLAG57
+    FLAG58
+    FLAG59
+    FLAG60
+    FLAG61
+    FLAG62
+    FLAG63
 
-    LIKE_URL
-    LIKE_NUMBER
+    ID
+    SIC
+    NORM
+    SHAPE
+    ASCIIED
+    PREFIX
+    SUFFIX
 
-    OFT_LOWER
-    OFT_TITLE
-    OFT_UPPER
-
-    IN_MALES
-    IN_FEMALES
-    IN_SURNAMES
-    IN_PLACES
-    IN_GAMES
-    IN_CELEBS
-    IN_NAMES
+    LENGTH
+    CLUSTER
+    POS_TYPE
+    SENSE_TYPE
 
 
 cdef struct Lexeme:
-    flag_t flags
+    flags_t flags
 
-    id_t id
-    id_t sic
-    id_t norm
-    id_t shape
-    id_t asciied
-    id_t prefix
-    id_t suffix
+    attr_t id
+    attr_t sic
+    attr_t norm
+    attr_t shape
+    attr_t asciied
+    attr_t prefix
+    attr_t suffix
+
+    attr_t length
+    attr_t cluster
+    attr_t pos_type
+    attr_t sense_type
 
     float prob
-
-    len_t length
-    tag_t cluster
-    tag_t postype
-    tag_t supersense
+    float upper_pc
+    float title_pc
 
 
 cdef Lexeme EMPTY_LEXEME
 
-cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
-                  StringStore store, dict props) except *
+
+cpdef Lexeme init(id_t i, unicode string, hash_t hashed, StringStore store,
+                  dict props) except *
 
-cdef inline bint check_flag(Lexeme* lexeme, size_t flag_id) nogil:
+cdef inline bint check_flag(Lexeme* lexeme, attr_id_t flag_id) nogil:
     return lexeme.flags & (1 << flag_id)
+
+
+cdef attr_t get_attr(Lexeme* lex, attr_id_t attr_id)
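
Note: attr_id_t deliberately spends its first 64 values on FLAG0 through FLAG63, one per bit of the 64-bit flags_t, so an attribute ID below 64 is literally a bit position and can be answered by check_flag alone; the content attributes then begin at ID = 64. The layout arithmetic, spelled out in Python:

    # Layout of attr_id_t: 64 reserved flag IDs, then the content attributes.
    FLAG_BITS = 64    # sizeof(flags_t) * 8 in the Cython code
    names = ['FLAG%d' % i for i in range(FLAG_BITS)] + [
        'ID', 'SIC', 'NORM', 'SHAPE', 'ASCIIED', 'PREFIX', 'SUFFIX',
        'LENGTH', 'CLUSTER', 'POS_TYPE', 'SENSE_TYPE']
    attr_ids = dict((name, i) for i, name in enumerate(names))

    def is_flag(attr_id):
        # The dispatch test used by get_attr in lexeme.pyx.
        return attr_id < FLAG_BITS

    assert attr_ids['ID'] == 64      # the first non-flag attribute
    assert is_flag(attr_ids['FLAG63']) and not is_flag(attr_ids['ID'])
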
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 64eb699a6..888edc07b 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -6,67 +6,59 @@ from libc.string cimport memset
 import orth
 
-from .utf8string cimport Utf8Str
-
-OOV_DIST_FLAGS = 0
 
 memset(&EMPTY_LEXEME, 0, sizeof(Lexeme))
 
-def get_flags(unicode string, float upper_pc, float title_pc, float lower_pc):
-    cdef flag_t flags = 0
-    flags |= orth.is_alpha(string) << IS_ALPHA
-    flags |= orth.is_ascii(string) << IS_ASCII
-    flags |= orth.is_digit(string) << IS_DIGIT
-    flags |= orth.is_lower(string) << IS_LOWER
-    flags |= orth.is_punct(string) << IS_PUNCT
-    flags |= orth.is_space(string) << IS_SPACE
-    flags |= orth.is_title(string) << IS_TITLE
-    flags |= orth.is_upper(string) << IS_UPPER
-
-    flags |= orth.like_url(string) << LIKE_URL
-    flags |= orth.like_number(string) << LIKE_NUMBER
-    return flags
-
-
 cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
-                  StringStore store, dict props) except *:
+                  StringStore string_store, dict props) except *:
     cdef Lexeme lex
     lex.id = i
     lex.length = len(string)
-    lex.sic = get_string_id(string, store)
+    lex.sic = string_store[string]
 
     lex.cluster = props.get('cluster', 0)
-    lex.postype = props.get('postype', 0)
-    lex.supersense = props.get('supersense', 0)
+    lex.pos_type = props.get('pos_type', 0)
+    lex.sense_type = props.get('sense_type', 0)
     lex.prob = props.get('prob', 0)
 
-    cdef float upper_pc = props.get('upper_pc', 0.0)
-    cdef float lower_pc = props.get('lower_pc', 0.0)
-    cdef float title_pc = props.get('title_pc', 0.0)
+    lex.upper_pc = props.get('upper_pc', 0.0)
+    lex.title_pc = props.get('title_pc', 0.0)
 
-    lex.prefix = get_string_id(string[0], store)
-    lex.suffix = get_string_id(string[-3:], store)
-    if upper_pc or lower_pc or title_pc:
-        canon_cased = orth.canon_case(string, upper_pc, title_pc, lower_pc)
-        lex.norm = get_string_id(canon_cased, store)
-    else:
-        lex.norm = lex.sic
-    lex.shape = get_string_id(orth.word_shape(string), store)
-    lex.asciied = get_string_id(orth.asciied(string), store)
-    lex.flags = get_flags(string, upper_pc, title_pc, lower_pc)
-
-    lex.flags |= props.get('in_males', 0) << IN_MALES
-    lex.flags |= props.get('in_females', 0) << IN_FEMALES
-    lex.flags |= props.get('in_surnames', 0) << IN_SURNAMES
-    lex.flags |= props.get('in_places', 0) << IN_PLACES
-    lex.flags |= props.get('in_celebs', 0) << IN_CELEBS
-    lex.flags |= props.get('in_games', 0) << IN_GAMES
-    lex.flags |= props.get('in_names', 0) << IN_NAMES
+    lex.prefix = string_store[string[:1]]
+    lex.suffix = string_store[string[-3:]]
+    lex.norm = lex.sic  # TODO
+    lex.shape = string_store[orth.word_shape(string)]
+    lex.asciied = string_store[orth.asciied(string)]
+
+    lex.flags = props.get('flags', 0)
     return lex
 
-cdef id_t get_string_id(unicode string, StringStore store) except 0:
-    cdef bytes byte_string = string.encode('utf8')
-    cdef Utf8Str* orig_str = store.intern(byte_string, len(byte_string))
-    return orig_str.i
+cdef attr_t get_attr(Lexeme* lex, attr_id_t feat_name):
+    if feat_name < (sizeof(flags_t) * 8):
+        return check_flag(lex, feat_name)
+    elif feat_name == ID:
+        return lex.id
+    elif feat_name == SIC:
+        return lex.sic
+    elif feat_name == NORM:
+        return lex.norm
+    elif feat_name == SHAPE:
+        return lex.shape
+    elif feat_name == ASCIIED:
+        return lex.asciied
+    elif feat_name == PREFIX:
+        return lex.prefix
+    elif feat_name == SUFFIX:
+        return lex.suffix
+    elif feat_name == LENGTH:
+        return lex.length
+    elif feat_name == CLUSTER:
+        return lex.cluster
+    elif feat_name == POS_TYPE:
+        return lex.pos_type
+    elif feat_name == SENSE_TYPE:
+        return lex.sense_type
+    else:
+        raise StandardError('Feature ID: %d not found' % feat_name)
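
Note: in the rewritten init above, every string-valued field now goes through StringStore's __getitem__, which interns the string and returns a stable integer ID, keeping Lexeme a flat struct of integers. A toy store showing the interning contract the new code relies on; this is a sketch, not the real utf8string class:

    # Toy StringStore: __getitem__ interns a string and returns a stable
    # integer ID. ID 0 stays reserved, as with EMPTY_LEXEME in the lexicon.
    class StringStore(object):
        def __init__(self):
            self._map = {}

        def __getitem__(self, string):
            if string not in self._map:
                self._map[string] = len(self._map) + 1
            return self._map[string]

    store = StringStore()
    word = u'tokenizer'
    sic = store[word]
    prefix = store[word[:1]]      # cf. lex.prefix = string_store[string[:1]]
    suffix = store[word[-3:]]     # cf. lex.suffix = string_store[string[-3:]]
    assert store[word] == sic     # repeated lookups return the same ID
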