* Remove Lexeme struct, preparing to rename Word to Lexeme.

This commit is contained in:
Matthew Honnibal 2014-08-24 19:24:42 +02:00
parent ce59526011
commit 88095666dc
6 changed files with 18 additions and 204 deletions

View File

@ -3,14 +3,13 @@ from libcpp.vector cimport vector
from spacy.spacy cimport StringHash from spacy.spacy cimport StringHash
from spacy.spacy cimport Language from spacy.spacy cimport Language
from spacy.word cimport Word from spacy.word cimport LatinWord
from spacy.tokens cimport Tokens
cimport cython cimport cython
cdef class English(spacy.Language): cdef class English(spacy.Language):
cdef int find_split(self, unicode word) cdef int find_split(self, unicode word)
cdef int set_orth(self, unicode word, Word lex) except -1 cdef LatinWord new_lexeme(self, unicode string)
cdef English EN cdef English EN

View File

@ -44,6 +44,9 @@ cimport spacy
cdef class English(spacy.Language): cdef class English(spacy.Language):
cdef LatinWord new_lexeme(self, unicode string):
    """Create the lexeme object for `string`.

    Args:
        string (unicode): The string passed to the LatinWord constructor.

    Returns:
        LatinWord: A newly constructed LatinWord for `string`.
    """
    cdef LatinWord lexeme = LatinWord(string)
    return lexeme
cdef int find_split(self, unicode word): cdef int find_split(self, unicode word):
cdef size_t length = len(word) cdef size_t length = len(word)
cdef int i = 0 cdef int i = 0

View File

@ -1,40 +0,0 @@
from libc.stdint cimport uint32_t
from libc.stdint cimport uint64_t
cimport cython
# Integer type of a word's Brown cluster (the return type of cluster_of).
ctypedef int ClusterID
# Integer type of a string hash (the return type of lex_of and view_of).
ctypedef uint32_t StringHash
# Handle for a lexeme; the accessor functions cast it to Lexeme*.
ctypedef size_t LexID
# Bit field holding orthographic flags (tested by check_orth_flag).
ctypedef char OrthFlags
# Bit field holding distributional flags (tested by check_dist_flag).
ctypedef char DistFlags
# Bit field holding possible-tag flags (tested by can_tag).
ctypedef uint64_t TagFlags
# Per-word record. The accessor functions receive a LexID and cast it to
# Lexeme* to read these fields.
cdef struct Lexeme:
# Hash of the word's string (read by lex_of).
StringHash lex
# Byte string of the word; first_of returns its first byte.
char* string
# Length of the word (read by length_of).
size_t length
# Estimate of the word's unigram log probability (read by prob_of).
double prob
# Integer representation of the word's Brown cluster (read by cluster_of).
ClusterID cluster
# Bit field tested by can_tag.
TagFlags possible_tags
# Bit field tested by check_dist_flag, is_often_uppered and is_often_titled.
DistFlags dist_flags
# Bit field written by set_flags and tested by check_orth_flag.
OrthFlags orth_flags
# Array of string hashes indexed by view_of.
StringHash* string_views
# Accessor declarations. Each function takes a lexeme handle, which the
# implementation casts to Lexeme*, and reads one field or tests one flag bit.
#
# NOTE(review): `except 0` / `except 1` make that return value the error
# sentinel. A legitimate result equal to the sentinel (for example a cluster
# ID of 0 or a zero-length word) would presumably be reported as an error --
# confirm whether `except? <value>` or `except *` was intended.
cpdef StringHash lex_of(LexID lex_id) except 0
cpdef char first_of(LexID lex_id) except 0
cpdef size_t length_of(LexID lex_id) except 0
cpdef double prob_of(LexID lex_id) except 1
cpdef ClusterID cluster_of(LexID lex_id) except 0
# Tests of the OFT_TITLE / OFT_UPPER bits of dist_flags.
cpdef bint is_often_titled(size_t lex_id)
cpdef bint is_often_uppered(size_t lex_id)
# Generic bit tests: each checks bit number `flag` of the matching bit field.
cpdef bint can_tag(LexID lex, TagFlags flag) except *
cpdef bint check_dist_flag(LexID lex, DistFlags flag) except *
cpdef bint check_orth_flag(LexID lex, OrthFlags flag) except *
# Returns entry `view` of the lexeme's string_views array.
cpdef StringHash view_of(LexID lex_id, size_t view) except 0

View File

@ -1,155 +0,0 @@
# cython: profile=True
# cython: embedsignature=True
'''Accessors for Lexeme properties, given a lex_id, which is cast to a Lexeme*.
Mostly useful from Python-space. From Cython-space, you can just cast to
Lexeme* yourself.
'''
from __future__ import unicode_literals
from libc.stdlib cimport malloc, calloc, free
from libc.stdint cimport uint64_t
from spacy.spacy cimport StringHash
# Python-visible enum for POS tags
# Tag IDs are consecutive integers starting at 0, in the order listed.
(PUNCT, CONJ, NUM, X, DET, ADP, ADJ,
 ADV, VERB, NOUN, PDT, POS, PRON, PRT) = range(14)
cpdef int set_flags(LexID lex_id, object active_flags) except *:
    """Switch on orthographic flag bits of a Lexeme.

    Args:
        lex_id (LexID): Handle for the Lexeme; it is cast to Lexeme*.
        active_flags: An iterable of bit positions. Each listed position is
            OR-ed into the lexeme's orth_flags; bits already set stay set.

    NOTE(review): OrthFlags is declared as `char`, so high bit positions
    presumably cannot be stored -- confirm callers stay in range.
    """
    cdef Lexeme* lexeme = <Lexeme*>lex_id
    cdef size_t bit
    for bit in active_flags:
        lexeme.orth_flags = lexeme.orth_flags | (1 << bit)
cpdef StringHash view_of(LexID lex_id, size_t view) except 0:
    '''Access one of the string hashes stored in the word's string_views.

    Args:
        lex_id (LexID): Handle for the Lexeme; it is cast to Lexeme*.
        view (size_t): Index into the string_views array. The index is not
            range-checked here.
    '''
    cdef Lexeme* lexeme = <Lexeme*>lex_id
    return lexeme.string_views[view]
cpdef StringHash lex_of(LexID lex_id) except 0:
    '''Return the stored hash of the word's string.

    >>> lex_of(lookup(u'Hi')) == hash(u'Hi')
    True
    '''
    cdef Lexeme* lexeme = <Lexeme*>lex_id
    return lexeme.lex
cpdef ClusterID cluster_of(LexID lex_id) except 0:
    '''Access an integer representation of the word's Brown cluster.

    A Brown cluster is an address into a binary tree, which gives some (noisy)
    information about the word's distributional context.

    Illustrative example (cluster IDs rendered as bit strings):

    >>> strings = (u'pineapple', u'apple', u'dapple', u'scalable')
    >>> token_ids = [lookup(s) for s in strings]
    >>> ["{0:b}".format(cluster_of(t)) for t in token_ids]
    ["100111110110", "100111100100", "01010111011001", "100111110110"]

    The clusterings are unideal, but often slightly useful.
    "pineapple" and "apple" share a long prefix, indicating a similar meaning,
    while "dapple" is totally different. On the other hand, "scalable" receives
    the same cluster ID as "pineapple", which is not what we'd like.
    '''
    cdef Lexeme* lexeme = <Lexeme*>lex_id
    return lexeme.cluster
cpdef char first_of(size_t lex_id) except 0:
    '''Return the first byte of the word's utf8-encoded string.

    >>> lex_id = lookup(u'Hello')
    >>> chr(first_of(lex_id))
    'H'
    '''
    cdef Lexeme* lexeme = <Lexeme*>lex_id
    cdef char* encoded = lexeme.string
    return encoded[0]
cpdef size_t length_of(size_t lex_id) except 0:
    '''Return the (unicode) length stored for the word.'''
    return (<Lexeme*>lex_id).length
cpdef double prob_of(size_t lex_id) except 1:
    '''Return the estimate of the word's unigram log probability.

    Probabilities are calculated from a large text corpus, and smoothed using
    simple Good-Turing. Estimates are read from data/en/probabilities, and
    can be replaced using spacy.en.load_probabilities.

    >>> prob_of(lookup(u'world'))
    -20.10340371976182
    '''
    cdef Lexeme* lexeme = <Lexeme*>lex_id
    return lexeme.prob
# Compile-time bit positions within Lexeme.dist_flags; the accessors below
# test them as `dist_flags & (1 << POSITION)`.
DEF OFT_UPPER = 1
DEF OFT_TITLE = 2
cpdef bint is_often_uppered(size_t lex_id):
    '''Test the OFT_UPPER distributional flag of the word.

    The OFT_UPPER flag records whether a lower-cased version of the word
    is found in all-upper case frequently in a large sample of text, where
    "frequently" is defined as P >= 0.95 (chosen for high mutual information
    for POS tagging).

    Case statistics are estimated from a large text corpus. Estimates are read
    from data/en/case_stats, and can be replaced using
    spacy.en.load_case_stats.

    >>> is_often_uppered(lookup(u'nato'))
    True
    >>> is_often_uppered(lookup(u'the'))
    False
    '''
    cdef Lexeme* lexeme = <Lexeme*>lex_id
    return lexeme.dist_flags & (1 << OFT_UPPER)
cpdef bint is_often_titled(size_t lex_id):
    '''Check the OFT_TITLE distributional flag for the word.

    The OFT_TITLE flag records whether a lower-cased version of the word
    is found title-cased (see string.istitle) frequently in a large sample of
    text, where "frequently" is defined as P >= 0.3 (chosen for high mutual
    information for POS tagging).

    Case statistics are estimated from a large text corpus. Estimates are read
    from data/en/case_stats, and can be replaced using
    spacy.en.load_case_stats.

    >>> is_often_titled(lookup(u'john'))
    True
    >>> is_often_titled(lookup(u'Bill'))
    False
    '''
    cdef Lexeme* lexeme = <Lexeme*>lex_id
    return lexeme.dist_flags & (1 << OFT_TITLE)
cpdef bint check_orth_flag(size_t lex_id, OrthFlags flag) except *:
    '''Test bit number `flag` of the word's orth_flags bit field.'''
    cdef Lexeme* lexeme = <Lexeme*>lex_id
    return lexeme.orth_flags & (1 << flag)
cpdef bint check_dist_flag(size_t lex_id, DistFlags flag) except *:
    '''Test bit number `flag` of the word's dist_flags bit field.'''
    cdef Lexeme* lexeme = <Lexeme*>lex_id
    return lexeme.dist_flags & (1 << flag)
cpdef bint can_tag(LexID lex_id, TagFlags flag) except *:
    '''Test bit number `flag` of the word's possible_tags bit field.

    Args:
        lex_id (LexID): Handle for the Lexeme; it is cast to Lexeme*.
        flag (TagFlags): Bit position to test. It must be smaller than the
            bit width of TagFlags (uint64_t).

    Returns:
        bint: True if the bit is set.
    '''
    cdef Lexeme* lexeme = <Lexeme*>lex_id
    # Build the mask in TagFlags width: a bare `1 << flag` has a plain int
    # literal as its left operand, so high bit positions would overflow it.
    cdef TagFlags mask = (<TagFlags>1) << flag
    return (lexeme.possible_tags & mask) != 0

View File

@ -29,18 +29,19 @@ cdef enum:
cdef class Word: cdef class Word:
# NB: the readonly keyword refers to _Python_ access. The attributes are # NB: the readonly keyword refers to _Python_ access. The attributes are
# writeable from Cython. # writeable from Cython.
cdef readonly StringHash lex cdef readonly StringHash key
cdef readonly char* string cdef readonly char** utf8_strings
cdef readonly size_t length cdef readonly size_t length
cdef readonly double prob cdef readonly double prob
cdef readonly ClusterID cluster cdef readonly ClusterID cluster
cdef readonly TagFlags possible_tags cdef readonly TagFlags possible_tags
cdef readonly DistFlags dist_flags cdef readonly DistFlags dist_flags
cdef readonly OrthFlags orth_flags cdef readonly OrthFlags orth_flags
cdef StringHash* string_views
cpdef StringHash get_view(self, size_t i) except 0 cpdef StringHash get_view(self, size_t i) except 0
cdef class CasedWord(Word):
cpdef bint can_tag(self, TagFlags flag) except * cpdef bint can_tag(self, TagFlags flag) except *
cpdef bint check_dist_flag(self, DistFlags flag) except * cpdef bint check_dist_flag(self, DistFlags flag) except *
cpdef bint check_orth_flag(self, OrthFlags flag) except * cpdef bint check_orth_flag(self, OrthFlags flag) except *

View File

@ -60,11 +60,11 @@ cdef class Word:
while "dapple" is totally different. On the other hand, "scalable" receives while "dapple" is totally different. On the other hand, "scalable" receives
the same cluster ID as "pineapple", which is not what we'd like. the same cluster ID as "pineapple", which is not what we'd like.
""" """
def __cinit__(self, bytes string, list string_views): def __cinit__(self, bytes string, list string_views, prob=0.0, cluster=0,
orth_flags=0, dist_flags=0, possible_tags=0):
self.string = <char*>string self.string = <char*>string
self.length = len(string) self.length = len(string)
self.lex = hash(string) self.views = <char**>calloc(len(string_views), sizeof(StringHash))
self.string_views = <StringHash*>calloc(len(string_views), sizeof(StringHash))
cdef unicode view cdef unicode view
for i in range(len(string_views)): for i in range(len(string_views)):
view = string_views[i] view = string_views[i]
@ -98,6 +98,12 @@ cdef class Word:
corpus. "Often" is chosen by heuristic. corpus. "Often" is chosen by heuristic.
""" """
return self.possible_tags & (1 << flag) return self.possible_tags & (1 << flag)
cdef class CasedWord(Word):
# Builds three string views for the word -- the results of
# get_normaized(string) and get_word_shape(string), plus the last three bytes
# of `string` -- and passes them to Word.__cinit__.
# NOTE(review): `get_normaized` looks like a misspelling (get_normalized?);
# the helper is not defined in the visible source -- confirm its name.
# NOTE(review): Cython calls a base type's __cinit__ automatically, with the
# same constructor arguments, before the subclass's __cinit__ runs. Confirm
# that this explicit Word.__cinit__ call and the differing signatures work as
# intended.
def __cinit__(self, bytes string):
string_views = [get_normaized(string), get_word_shape(string), string[-3:]]
Word.__cinit__(self, string, string_views)
cpdef bint is_often_uppered(self) except *: cpdef bint is_often_uppered(self) except *:
'''Check the OFT_UPPER distributional flag for the word. '''Check the OFT_UPPER distributional flag for the word.