From 5fddb8d16590b549161e84b256a20e694e090291 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 19 Aug 2014 04:21:20 +0200 Subject: [PATCH] * Working refactor, with updated data model for Lexemes --- spacy/en.pyx | 28 --------------------- spacy/lexeme.pxd | 20 +++++++-------- spacy/lexeme.pyx | 45 ++++++++++++++++++++++++++-------- spacy/spacy.pxd | 3 --- spacy/spacy.pyx | 64 ++++++++++++++++++++---------------------------- 5 files changed, 72 insertions(+), 88 deletions(-) diff --git a/spacy/en.pyx b/spacy/en.pyx index 4357addfe..3b7d506fa 100644 --- a/spacy/en.pyx +++ b/spacy/en.pyx @@ -66,31 +66,3 @@ cpdef unicode unhash(StringHash hash_value): return EN.unhash(hash_value) -cpdef bint is_oft_upper(size_t lex_id): - '''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which - stores whether the lowered version of the string hashed by `lex' is found - in all-upper case frequently in a large sample of text. Users are free - to load different data, by default we use a sample from Wikipedia, with - a threshold of 0.95, picked to maximize mutual information for POS tagging. - - >>> is_oft_upper(lookup(u'abc')) - True - >>> is_oft_upper(lookup(u'aBc')) # This must get the same answer - True - ''' - return (lex_id).dist.flags & OFT_UPPER - - -cpdef bint is_oft_title(size_t lex_id): - '''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which - stores whether the lowered version of the string hashed by `lex' is found - title-cased frequently in a large sample of text. Users are free - to load different data, by default we use a sample from Wikipedia, with - a threshold of 0.3, picked to maximize mutual information for POS tagging. - - >>> is_oft_title(lookup(u'marcus')) - True - >>> is_oft_title(lookup(u'MARCUS')) # This must get the same value - True - ''' - return (lex_id).dist.flags & OFT_TITLE diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index 6175ec3a8..6d944eb25 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -32,12 +32,9 @@ cdef enum DistFlag: cdef struct Orthography: - StringHash last3 StringHash shape StringHash norm - - size_t length - Py_UNICODE first + StringHash last3 Bits8 flags @@ -49,12 +46,17 @@ cdef struct Distribution: cdef struct Lexeme: - StringHash lex # Hash of the word - Orthography* orth # Extra orthographic views - Distribution* dist # Distribution info + char* string + size_t length + StringHash lex + Orthography orth # Extra orthographic views + Distribution dist # Distribution info -cdef Lexeme BLANK_WORD = Lexeme(0, NULL, NULL) +cdef Lexeme BLANK_WORD = Lexeme(NULL, 0, 0, + Orthography(0, 0, 0, 0), + Distribution(0.0, 0, 0, 0) +) cdef enum StringAttr: @@ -68,13 +70,11 @@ cdef enum StringAttr: cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0 cpdef StringHash lex_of(size_t lex_id) except 0 - cpdef StringHash norm_of(size_t lex_id) except 0 cpdef StringHash shape_of(size_t lex_id) except 0 cpdef StringHash last3_of(size_t lex_id) except 0 cpdef size_t length_of(size_t lex_id) except * -cpdef Py_UNICODE first_of(size_t lex_id) except * cpdef double prob_of(size_t lex_id) except 0 cpdef ClusterID cluster_of(size_t lex_id) except 0 diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 27ca13bd7..37392637b 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -72,7 +72,7 @@ cpdef StringHash last3_of(size_t lex_id) except 0: return (lex_id).orth.last3 -cpdef ClusterID cluster_of(size_t lex_id): +cpdef ClusterID cluster_of(size_t lex_id) except 0: '''Access the `cluster' field of the Lexeme pointed to by lex_id, which gives an integer representation of the cluster ID of the word, which should be understood as a binary address: @@ -99,21 +99,17 @@ cpdef Py_UNICODE first_of(size_t lex_id): >>> unhash(first_of(lex_id)) u'H' ''' - if (lex_id).orth == NULL: - return 0 return (lex_id).orth.first -cpdef StringHash length_of(size_t lex_id): +cpdef size_t length_of(size_t lex_id) except *: '''Access the `length' field of the Lexeme pointed to by lex_id, which stores the length of the string hashed by lex_of.''' cdef Lexeme* word = lex_id - if (lex_id).orth == NULL: - return 0 - return (lex_id).orth.length + return word.length -cpdef double prob_of(size_t lex_id): +cpdef double prob_of(size_t lex_id) except 0: '''Access the `prob' field of the Lexeme pointed to by lex_id, which stores the smoothed unigram log probability of the word, as estimated from a large text corpus. By default, probabilities are based on counts from Gigaword, @@ -126,9 +122,38 @@ cpdef double prob_of(size_t lex_id): return (lex_id).dist.prob -cpdef bint check_orth_flag(size_t lex, OrthFlag flag) except *: +cpdef bint is_oft_upper(size_t lex_id): + '''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which + stores whether the lowered version of the string hashed by `lex' is found + in all-upper case frequently in a large sample of text. Users are free + to load different data, by default we use a sample from Wikipedia, with + a threshold of 0.95, picked to maximize mutual information for POS tagging. + + >>> is_oft_upper(lookup(u'abc')) + True + >>> is_oft_upper(lookup(u'aBc')) # This must get the same answer + True + ''' + return (lex_id).dist.flags & OFT_UPPER + + +cpdef bint is_oft_title(size_t lex_id): + '''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which + stores whether the lowered version of the string hashed by `lex' is found + title-cased frequently in a large sample of text. Users are free + to load different data, by default we use a sample from Wikipedia, with + a threshold of 0.3, picked to maximize mutual information for POS tagging. + + >>> is_oft_title(lookup(u'marcus')) + True + >>> is_oft_title(lookup(u'MARCUS')) # This must get the same value + True + ''' + return (lex_id).dist.flags & OFT_TITLE + +cpdef bint check_orth_flag(size_t lex_id, OrthFlag flag) except *: return (lex_id).orth.flags & (1 << flag) -cpdef bint check_dist_flag(size_t lex, DistFlag flag) except *: +cpdef bint check_dist_flag(size_t lex_id, DistFlag flag) except *: return (lex_id).dist.flags & (1 << flag) diff --git a/spacy/spacy.pxd b/spacy/spacy.pxd index 344b3577c..b9caac34f 100644 --- a/spacy/spacy.pxd +++ b/spacy/spacy.pxd @@ -21,7 +21,6 @@ ctypedef int ClusterID from spacy.lexeme cimport Lexeme from spacy.lexeme cimport Distribution from spacy.lexeme cimport Orthography -from spacy._hashing cimport WordTree cdef class Language: @@ -37,8 +36,6 @@ cdef class Language: cdef Lexeme** new_chunk(self, unicode string, list substrings) except NULL cdef Lexeme* new_lexeme(self, unicode lex) except NULL - cdef Orthography* new_orth(self, unicode lex) except NULL - cdef Distribution* new_dist(self, unicode lex) except NULL cdef unicode unhash(self, StringHash hashed) diff --git a/spacy/spacy.pyx b/spacy/spacy.pyx index e4c5cf240..a8b4ebe74 100644 --- a/spacy/spacy.pyx +++ b/spacy/spacy.pyx @@ -13,15 +13,19 @@ from spacy.string_tools cimport substr from . import util from os import path +DIST_FLAGS = {} +TAGS = {} -def get_normalized(unicode lex, size_t length): + +def get_normalized(unicode lex): if lex.isalpha() and lex.islower(): return lex else: - return get_word_shape(lex, length) + return get_word_shape(lex) -def get_word_shape(unicode lex, length): +def get_word_shape(unicode lex): + cdef size_t length = len(lex) shape = "" last = "" shape_char = "" @@ -47,7 +51,7 @@ def get_word_shape(unicode lex, length): return shape -def set_orth_flags(lex, length): +def set_orth_flags(lex): return 0 @@ -60,7 +64,7 @@ cdef class Language: self.chunks.set_empty_key(0) self.vocab.set_empty_key(0) self.load_tokenization(util.read_tokenization(name)) - self.load_dist_info(util.read_dist_info(name)) + #self.load_dist_info(util.read_dist_info(name)) cdef Tokens tokenize(self, unicode string): cdef Lexeme** chunk @@ -106,39 +110,25 @@ cdef class Language: cdef Lexeme* new_lexeme(self, unicode string) except NULL: cdef Lexeme* word = calloc(1, sizeof(Lexeme)) + cdef bytes byte_string = string.encode('utf8') + word.string = byte_string + word.length = len(byte_string) + word.orth.flags = set_orth_flags(string) + cdef unicode norm = get_normalized(string) + cdef unicode shape = get_word_shape(string) + cdef unicode last3 = string[-3:] word.lex = hash(string) + word.orth.norm = hash(norm) + word.orth.shape = hash(shape) + word.orth.last3 = hash(last3) self.bacov[word.lex] = string - word.orth = self.new_orth(string) + self.bacov[word.orth.norm] = norm + self.bacov[word.orth.shape] = shape + self.bacov[word.orth.last3] = last3 - word.dist = calloc(1, sizeof(Distribution)) - self.vocab[word.lex] = word + self.vocab[hash(string)] = word return word - cdef Orthography* new_orth(self, unicode lex) except NULL: - cdef unicode last3 - cdef unicode norm - cdef unicode shape - cdef int length - - length = len(lex) - orth = calloc(1, sizeof(Orthography)) - orth.first = lex[0] - - orth.length = length - orth.flags = set_orth_flags(lex, orth.length) - orth.norm = hash(lex) - last3 = substr(lex, length - 3, length, length) - orth.last3 = hash(last3) - norm = get_normalized(lex, length) - orth.norm = hash(norm) - shape = get_word_shape(lex, length) - orth.shape = hash(shape) - - self.bacov[orth.last3] = last3 - self.bacov[orth.norm] = norm - self.bacov[orth.shape] = shape - return orth - cdef unicode unhash(self, StringHash hash_value): '''Fetch a string from the reverse index, given its hash value.''' return self.bacov[hash_value] @@ -167,12 +157,12 @@ cdef class Language: cdef Lexeme* w for string, word_dist in dist_info.items(): w = self.lookup(string) - w.prob = word_dist.prob - w.cluster = word_dist.cluster + w.dist.prob = word_dist.prob + w.dist.cluster = word_dist.cluster for flag in word_dist.flags: - w.flags |= lexeme.DIST_FLAGS[flag] + w.dist.flags |= DIST_FLAGS[flag] for tag in word_dist.tagdict: - w.tagdict |= lexeme.TAGS[tag] + w.dist.tagdict |= TAGS[tag] cdef inline bint _is_whitespace(Py_UNICODE c) nogil: