diff --git a/spacy/en.pyx b/spacy/en.pyx
index f90af1549..4357addfe 100644
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@@ -64,3 +64,33 @@ cpdef Lexeme_addr lookup(unicode string) except 0:
 
 cpdef unicode unhash(StringHash hash_value):
     return EN.unhash(hash_value)
+
+
+cpdef bint is_oft_upper(size_t lex_id):
+    '''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which
+    stores whether the lowered version of the string hashed by `lex' is found
+    in all-upper case frequently in a large sample of text. Users are free
+    to load different data; by default we use a sample from Wikipedia, with
+    a threshold of 0.95, picked to maximize mutual information for POS tagging.
+
+    >>> is_oft_upper(lookup(u'abc'))
+    True
+    >>> is_oft_upper(lookup(u'aBc')) # This must get the same answer
+    True
+    '''
+    return (<Lexeme*>lex_id).dist.flags & (1 << OFT_UPPER)
+
+
+cpdef bint is_oft_title(size_t lex_id):
+    '''Access the `oft_title' field of the Lexeme pointed to by lex_id, which
+    stores whether the lowered version of the string hashed by `lex' is found
+    title-cased frequently in a large sample of text. Users are free
+    to load different data; by default we use a sample from Wikipedia, with
+    a threshold of 0.3, picked to maximize mutual information for POS tagging.
+
+    >>> is_oft_title(lookup(u'marcus'))
+    True
+    >>> is_oft_title(lookup(u'MARCUS')) # This must get the same value
+    True
+    '''
+    return (<Lexeme*>lex_id).dist.flags & (1 << OFT_TITLE)
diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd
index 90d06587e..6175ec3a8 100644
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -9,13 +9,35 @@ ctypedef char Bits8
 ctypedef uint64_t Bits64
 
 
+cdef enum OrthFlag:
+    IS_ALPHA
+    IS_DIGIT
+    IS_PUNCT
+    IS_WHITE
+    IS_LOWER
+    IS_UPPER
+    IS_TITLE
+    IS_ASCII
+
+
+cdef enum DistFlag:
+    OFT_UPPER
+    OFT_TITLE
+    DIST_FLAG3
+    DIST_FLAG4
+    DIST_FLAG5
+    DIST_FLAG6
+    DIST_FLAG7
+    DIST_FLAG8
+
+
 cdef struct Orthography:
     StringHash last3
     StringHash shape
     StringHash norm
 
     size_t length
-    unsigned char first
+    Py_UNICODE first
     Bits8 flags
 
 
@@ -27,15 +49,12 @@ cdef struct Distribution:
 
 
 cdef struct Lexeme:
-    StringHash sic      # Hash of the original string
-    StringHash lex      # Hash of the word, with punctuation and clitics split off
-
-    Distribution* dist  # Distribution info, lazy loaded
+    StringHash lex      # Hash of the word
     Orthography* orth   # Extra orthographic views
-    #Lexeme* tail       # Lexemes are linked lists, to deal with sub-tokens
+    Distribution* dist  # Distribution info
 
 
-cdef Lexeme BLANK_WORD = Lexeme(0, 0, NULL, NULL, NULL)
+cdef Lexeme BLANK_WORD = Lexeme(0, NULL, NULL)
 
 
 cdef enum StringAttr:
@@ -49,7 +68,16 @@ cdef enum StringAttr:
 
 cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0
 cpdef StringHash lex_of(size_t lex_id) except 0
+
 cpdef StringHash norm_of(size_t lex_id) except 0
 cpdef StringHash shape_of(size_t lex_id) except 0
 cpdef StringHash last3_of(size_t lex_id) except 0
-cpdef StringHash length_of(size_t lex_id)
+
+cpdef size_t length_of(size_t lex_id) except *
+cpdef Py_UNICODE first_of(size_t lex_id) except *
+
+cpdef double prob_of(size_t lex_id) except 0
+cpdef ClusterID cluster_of(size_t lex_id) except 0
+
+cpdef bint check_orth_flag(size_t lex_id, OrthFlag flag) except *
+cpdef bint check_dist_flag(size_t lex_id, DistFlag flag) except *
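Note on the flag enums above: the OrthFlag and DistFlag members are bit positions into the Bits8/Bits64 `flags` fields, not masks, so a flag is tested with `flags & (1 << flag)`. A minimal pure-Python sketch of the scheme (the explicit values and helper names here are illustrative, not part of the patch):

    from enum import IntEnum

    class DistFlag(IntEnum):
        # Bit positions, mirroring the cdef enum: OFT_UPPER == 0, OFT_TITLE == 1
        OFT_UPPER = 0
        OFT_TITLE = 1

    def set_flag(flags, flag):
        # Turn on the bit at position `flag`.
        return flags | (1 << flag)

    def check_flag(flags, flag):
        # Test the bit at position `flag`; same logic as check_dist_flag below.
        return bool(flags & (1 << flag))

    flags = set_flag(0, DistFlag.OFT_TITLE)
    assert check_flag(flags, DistFlag.OFT_TITLE)
    assert not check_flag(flags, DistFlag.OFT_UPPER)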
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 42c93ec60..27ca13bd7 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -13,13 +13,6 @@ from libcpp.vector cimport vector
 from spacy.spacy cimport StringHash
 
 
-# Reiterate the enum, for python
-#SIC = StringAttr.sic
-#LEX = StringAttr.lex
-#NORM = StringAttr.norm
-#SHAPE = StringAttr.shape
-#LAST3 = StringAttr.last3
-
 
 cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0:
     if attr == LEX:
@@ -133,37 +126,9 @@ cpdef double prob_of(size_t lex_id):
     return (<Lexeme*>lex_id).dist.prob
 
 
-cpdef bint is_oft_upper(size_t lex_id):
-    '''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which
-    stores whether the lowered version of the string hashed by `lex' is found
-    in all-upper case frequently in a large sample of text. Users are free
-    to load different data, by default we use a sample from Wikipedia, with
-    a threshold of 0.95, picked to maximize mutual information for POS tagging.
-
-    >>> is_oft_upper(lookup(u'abc'))
-    True
-    >>> is_oft_upper(lookup(u'aBc')) # This must get the same answer
-    True
-    '''
-    return False
-    #cdef Lexeme* w = <Lexeme*>lex_id
-    #return w.orth.last3 if w.orth != NULL else 0
+cpdef bint check_orth_flag(size_t lex_id, OrthFlag flag) except *:
+    return (<Lexeme*>lex_id).orth.flags & (1 << flag)
 
-    #return (<Lexeme*>lex_id).oft_upper
-
-
-cpdef bint is_oft_title(size_t lex_id):
-    '''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which
-    stores whether the lowered version of the string hashed by `lex' is found
-    title-cased frequently in a large sample of text. Users are free
-    to load different data, by default we use a sample from Wikipedia, with
-    a threshold of 0.3, picked to maximize mutual information for POS tagging.
-
-    >>> is_oft_title(lookup(u'marcus'))
-    True
-    >>> is_oft_title(lookup(u'MARCUS')) # This must get the same value
-    True
-    '''
-    return False
-    #return (<Lexeme*>lex_id).oft_title
+
+cpdef bint check_dist_flag(size_t lex_id, DistFlag flag) except *:
+    return (<Lexeme*>lex_id).dist.flags & (1 << flag)
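With the generic checkers in place, the per-flag accessors moved into en.pyx reduce to one-liners over check_dist_flag. A hypothetical pure-Python rendering of that equivalence, operating on raw flag words rather than lexeme addresses and reusing check_flag and DistFlag from the sketch above:

    def is_oft_upper(flags):
        # Equivalent to the hand-written bit test added to en.pyx.
        return check_flag(flags, DistFlag.OFT_UPPER)

    def is_oft_title(flags):
        return check_flag(flags, DistFlag.OFT_TITLE)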
diff --git a/spacy/spacy.pyx b/spacy/spacy.pyx
index befa82cc7..e4c5cf240 100644
--- a/spacy/spacy.pyx
+++ b/spacy/spacy.pyx
@@ -60,6 +60,7 @@ cdef class Language:
         self.chunks.set_empty_key(0)
         self.vocab.set_empty_key(0)
         self.load_tokenization(util.read_tokenization(name))
+        self.load_dist_info(util.read_dist_info(name))
 
     cdef Tokens tokenize(self, unicode string):
         cdef Lexeme** chunk
@@ -108,7 +109,8 @@ cdef class Language:
         word.lex = hash(string)
         self.bacov[word.lex] = string
         word.orth = self.new_orth(string)
-        word.dist = self.new_dist(string)
+
+        word.dist = <Distribution*>calloc(1, sizeof(Distribution))
         self.vocab[word.lex] = word
         return word
 
@@ -135,13 +137,8 @@ cdef class Language:
             self.bacov[orth.last3] = last3
             self.bacov[orth.norm] = norm
             self.bacov[orth.shape] = shape
-
         return orth
 
-    cdef Distribution* new_dist(self, unicode lex) except NULL:
-        dist = <Distribution*>calloc(1, sizeof(Distribution))
-        return dist
-
     cdef unicode unhash(self, StringHash hash_value):
         '''Fetch a string from the reverse index, given its hash value.'''
         return self.bacov[hash_value]
@@ -164,21 +161,18 @@ cdef class Language:
         for chunk, tokens in token_rules:
             self.new_chunk(chunk, tokens)
 
-    def load_clusters(self):
+    def load_dist_info(self, dist_info):
+        cdef unicode string
+        cdef dict word_dist
         cdef Lexeme* w
-        data_dir = path.join(path.dirname(__file__), '..', 'data', 'en')
-        case_stats = util.load_case_stats(data_dir)
-        brown_loc = path.join(data_dir, 'clusters')
-        cdef size_t start
-        cdef int end
-        with util.utf8open(brown_loc) as browns_file:
-            for i, line in enumerate(browns_file):
-                cluster_str, token_string, freq_str = line.split()
-                # Decode as a little-endian string, so that we can do & 15 to get
-                # the first 4 bits. See redshift._parse_features.pyx
-                cluster = int(cluster_str[::-1], 2)
-                upper_pc, title_pc = case_stats.get(token_string.lower(), (0.0, 0.0))
-                self.new_lexeme(token_string)
+        for string, word_dist in dist_info.items():
+            w = <Lexeme*>self.lookup(string)
+            w.dist.prob = word_dist['prob']
+            w.dist.cluster = word_dist['cluster']
+            for flag in word_dist['flags']:
+                w.dist.flags |= lexeme.DIST_FLAGS[flag]
+            for tag in word_dist['tagdict']:
+                w.dist.tagdict |= lexeme.TAGS[tag]
 
 
 cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
diff --git a/spacy/util.py b/spacy/util.py
index 4d12014ca..c9bce0171 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -19,6 +19,12 @@ def load_case_stats(data_dir):
     return case_stats
 
 
+def read_dist_info(lang):
+    with utf8open(path.join(DATA_DIR, lang, 'distribution_info.json')) as file_:
+        dist_info = json.load(file_)
+    return dist_info
+
+
 def read_tokenization(lang):
     loc = path.join(DATA_DIR, lang, 'tokenization')
     entries = []
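The new read_dist_info/load_dist_info path implies that distribution_info.json maps each word form to its precomputed distributional statistics. The exact schema isn't shown in this patch, but the keys load_dist_info reads suggest entries like the following (all values here are fabricated for illustration):

    # Hypothetical distribution_info.json contents, as a Python literal:
    dist_info = {
        u'marcus': {
            'prob': -9.5,            # unigram log probability
            'cluster': 42,           # Brown cluster id
            'flags': ['OFT_TITLE'],  # names resolved via lexeme.DIST_FLAGS
            'tagdict': ['NNP'],      # names resolved via lexeme.TAGS
        },
    }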