diff --git a/spacy/en.pxd b/spacy/en.pxd index 23e11d1dc..7e99cdd43 100644 --- a/spacy/en.pxd +++ b/spacy/en.pxd @@ -3,14 +3,13 @@ from libcpp.vector cimport vector from spacy.spacy cimport StringHash from spacy.spacy cimport Language -from spacy.word cimport Word -from spacy.tokens cimport Tokens +from spacy.word cimport LatinWord cimport cython cdef class English(spacy.Language): cdef int find_split(self, unicode word) - cdef int set_orth(self, unicode word, Word lex) except -1 + cdef LatinWord new_lexeme(self, unicode string) cdef English EN diff --git a/spacy/en.pyx b/spacy/en.pyx index 66394af4f..df783c35e 100644 --- a/spacy/en.pyx +++ b/spacy/en.pyx @@ -44,6 +44,9 @@ cimport spacy cdef class English(spacy.Language): + cdef LatinWord new_lexeme(self, unicode string): + return LatinWord(string) + cdef int find_split(self, unicode word): cdef size_t length = len(word) cdef int i = 0 diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd deleted file mode 100644 index be18350bb..000000000 --- a/spacy/lexeme.pxd +++ /dev/null @@ -1,40 +0,0 @@ -from libc.stdint cimport uint32_t -from libc.stdint cimport uint64_t -cimport cython - -ctypedef int ClusterID -ctypedef uint32_t StringHash -ctypedef size_t LexID -ctypedef char OrthFlags -ctypedef char DistFlags -ctypedef uint64_t TagFlags - - -cdef struct Lexeme: - StringHash lex - char* string - size_t length - double prob - ClusterID cluster - TagFlags possible_tags - DistFlags dist_flags - OrthFlags orth_flags - StringHash* string_views - - -cpdef StringHash lex_of(LexID lex_id) except 0 -cpdef char first_of(LexID lex_id) except 0 -cpdef size_t length_of(LexID lex_id) except 0 -cpdef double prob_of(LexID lex_id) except 1 -cpdef ClusterID cluster_of(LexID lex_id) except 0 - - -cpdef bint is_often_titled(size_t lex_id) -cpdef bint is_often_uppered(size_t lex_id) - - -cpdef bint can_tag(LexID lex, TagFlags flag) except * -cpdef bint check_dist_flag(LexID lex, DistFlags flag) except * -cpdef bint check_orth_flag(LexID lex, OrthFlags flag) except * - -cpdef StringHash view_of(LexID lex_id, size_t view) except 0 diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx deleted file mode 100644 index 66fac7a74..000000000 --- a/spacy/lexeme.pyx +++ /dev/null @@ -1,155 +0,0 @@ -# cython: profile=True -# cython: embedsignature=True -'''Accessors for Lexeme properties, given a lex_id, which is cast to a Lexeme*. -Mostly useful from Python-space. From Cython-space, you can just cast to -Lexeme* yourself. -''' -from __future__ import unicode_literals - -from libc.stdlib cimport malloc, calloc, free -from libc.stdint cimport uint64_t - -from spacy.spacy cimport StringHash - -# Python-visible enum for POS tags -PUNCT = 0 -CONJ = 1 -NUM = 2 -X = 3 -DET = 4 -ADP = 5 -ADJ = 6 -ADV = 7 -VERB = 8 -NOUN = 9 -PDT = 10 -POS = 11 -PRON = 12 -PRT = 13 - -cpdef int set_flags(LexID lex_id, object active_flags) except *: - """Set orthographic bit flags for a Lexeme. - - Args: - lex_id (LexemeID): A reference ID for a Lexeme. - active_flags: A sequence of bits to set as True. - """ - cdef size_t flag - cdef Lexeme* w = lex_id - for flag in active_flags: - w.orth_flags |= 1 << flag - - -cpdef StringHash view_of(LexID lex_id, size_t view) except 0: - return (lex_id).string_views[view] - - -cpdef StringHash lex_of(LexID lex_id) except 0: - '''Access a hash of the word's string. - - >>> lex_of(lookup(u'Hi')) == hash(u'Hi') - True - ''' - return (lex_id).lex - - -cpdef ClusterID cluster_of(LexID lex_id) except 0: - '''Access an integer representation of the word's Brown cluster. - - A Brown cluster is an address into a binary tree, which gives some (noisy) - information about the word's distributional context. - - >>> strings = (u'pineapple', u'apple', u'dapple', u'scalable') - >>> token_ids = [lookup(s) for s in strings] - >>> clusters = [cluster_of(t) for t in token_ids] - >>> print ["{0:b"} % cluster_of(t) for t in token_ids] - ["100111110110", "100111100100", "01010111011001", "100111110110"] - - The clusterings are unideal, but often slightly useful. - "pineapple" and "apple" share a long prefix, indicating a similar meaning, - while "dapple" is totally different. On the other hand, "scalable" receives - the same cluster ID as "pineapple", which is not what we'd like. - ''' - return (lex_id).cluster - - -cpdef char first_of(size_t lex_id) except 0: - '''Access the first byte of a utf8 encoding of the word. - - >>> lex_id = lookup(u'Hello') - >>> chr(first_of(lex_id)) - 'H' - ''' - return (lex_id).string[0] - - -cpdef size_t length_of(size_t lex_id) except 0: - '''Access the (unicode) length of the word. - ''' - cdef Lexeme* word = lex_id - return word.length - - -cpdef double prob_of(size_t lex_id) except 1: - '''Access an estimate of the word's unigram log probability. - - Probabilities are calculated from a large text corpus, and smoothed using - simple Good-Turing. Estimates are read from data/en/probabilities, and - can be replaced using spacy.en.load_probabilities. - - >>> prob_of(lookup(u'world')) - -20.10340371976182 - ''' - return (lex_id).prob - -DEF OFT_UPPER = 1 -DEF OFT_TITLE = 2 - -cpdef bint is_often_uppered(size_t lex_id): - '''Check the OFT_UPPER distributional flag for the word. - - The OFT_UPPER flag records whether a lower-cased version of the word - is found in all-upper case frequently in a large sample of text, where - "frequently" is defined as P >= 0.95 (chosen for high mutual information for - POS tagging). - - Case statistics are estimated from a large text corpus. Estimates are read - from data/en/case_stats, and can be replaced using spacy.en.load_case_stats. - - >>> is_often_uppered(lookup(u'nato')) - True - >>> is_often_uppered(lookup(u'the')) - False - ''' - return (lex_id).dist_flags & (1 << OFT_UPPER) - - -cpdef bint is_often_titled(size_t lex_id): - '''Check the OFT_TITLE distributional flag for the word. - - The OFT_TITLE flag records whether a lower-cased version of the word - is found title-cased (see string.istitle) frequently in a large sample of text, - where "frequently" is defined as P >= 0.3 (chosen for high mutual information for - POS tagging). - - Case statistics are estimated from a large text corpus. Estimates are read - from data/en/case_stats, and can be replaced using spacy.en.load_case_stats. - - >>> is_oft_upper(lookup(u'john')) - True - >>> is_oft_upper(lookup(u'Bill')) - False - ''' - return (lex_id).dist_flags & (1 << OFT_TITLE) - - -cpdef bint check_orth_flag(size_t lex_id, OrthFlags flag) except *: - return (lex_id).orth_flags & (1 << flag) - - -cpdef bint check_dist_flag(size_t lex_id, DistFlags flag) except *: - return (lex_id).dist_flags & (1 << flag) - - -cpdef bint can_tag(LexID lex_id, TagFlags flag) except *: - return (lex_id).possible_tags & (1 << flag) diff --git a/spacy/word.pxd b/spacy/word.pxd index 2e14397c5..d696d3727 100644 --- a/spacy/word.pxd +++ b/spacy/word.pxd @@ -29,18 +29,19 @@ cdef enum: cdef class Word: # NB: the readonly keyword refers to _Python_ access. The attributes are # writeable from Cython. - cdef readonly StringHash lex - cdef readonly char* string + cdef readonly StringHash key + cdef readonly char** utf8_strings cdef readonly size_t length cdef readonly double prob cdef readonly ClusterID cluster cdef readonly TagFlags possible_tags cdef readonly DistFlags dist_flags cdef readonly OrthFlags orth_flags - cdef StringHash* string_views cpdef StringHash get_view(self, size_t i) except 0 + +cdef class CasedWord(Word): cpdef bint can_tag(self, TagFlags flag) except * cpdef bint check_dist_flag(self, DistFlags flag) except * cpdef bint check_orth_flag(self, OrthFlags flag) except * diff --git a/spacy/word.pyx b/spacy/word.pyx index 0f5c46aa8..74f06b81f 100644 --- a/spacy/word.pyx +++ b/spacy/word.pyx @@ -60,11 +60,11 @@ cdef class Word: while "dapple" is totally different. On the other hand, "scalable" receives the same cluster ID as "pineapple", which is not what we'd like. """ - def __cinit__(self, bytes string, list string_views): + def __cinit__(self, bytes string, list string_views, prob=0.0, cluster=0, + orth_flags=0, dist_flags=0, possible_tags=0): self.string = string self.length = len(string) - self.lex = hash(string) - self.string_views = calloc(len(string_views), sizeof(StringHash)) + self.views = calloc(len(string_views), sizeof(StringHash)) cdef unicode view for i in range(len(string_views)): view = string_views[i] @@ -98,6 +98,12 @@ cdef class Word: corpus. "Often" is chosen by heuristic. """ return self.possible_tags & (1 << flag) + + +cdef class CasedWord(Word): + def __cinit__(self, bytes string): + string_views = [get_normaized(string), get_word_shape(string), string[-3:]] + Word.__cinit__(self, string, string_views) cpdef bint is_often_uppered(self) except *: '''Check the OFT_UPPER distributional flag for the word.