* Add Word classes

This commit is contained in:
Matthew Honnibal 2014-08-24 18:14:08 +02:00
parent 3b793cf4f7
commit ce59526011
2 changed files with 297 additions and 0 deletions

58
spacy/word.pxd Normal file
View File

@ -0,0 +1,58 @@
from libc.stdint cimport uint32_t
from libc.stdint cimport uint64_t
# Type of the Word.cluster attribute.
ctypedef int ClusterID
# 32-bit hash value; the type of Word.lex and of each entry in Word.string_views.
ctypedef uint32_t StringHash
# Not referenced by the declarations visible in this file.
ctypedef size_t LexID
# Bit-sets of boolean features, tested with `flags & (1 << bit)`.
# NOTE(review): plain `char` has implementation-defined signedness; with 8
# orthographic flags bit 7 is the sign bit on signed-char platforms. Consider
# `unsigned char` -- confirm no caller depends on the Python-visible value.
ctypedef char OrthFlags
ctypedef char DistFlags
# 64-bit bit-set with one bit per tag (see Word.can_tag).
ctypedef uint64_t TagFlags
# Bit positions inside an OrthFlags value. Each Word.is_*() predicate tests
# `orth_flags & (1 << POSITION)` for the matching enumerator.
cdef enum OrthFlag:
    IS_ALPHA = 0
    IS_DIGIT = 1
    IS_PUNCT = 2
    IS_SPACE = 3
    IS_LOWER = 4
    IS_UPPER = 5
    IS_TITLE = 6
    IS_ASCII = 7
# Anonymous enum of small integer constants.
# NOTE(review): presumably these are indices into Word.string_views (the
# argument of Word.get_view) -- confirm against the code that builds the views.
cdef enum:
    NORM = 0
    SHAPE = 1
    LAST3 = 2
cdef class Word:
    # Declaration of the Word extension type: its C-level attributes and the
    # signatures of its cpdef methods.
    #
    # "readonly" only restricts access from Python; Cython code that cimports
    # this type can still assign to these attributes.
    cdef readonly:
        StringHash lex
        char* string
        size_t length
        double prob
        ClusterID cluster
        TagFlags possible_tags
        DistFlags dist_flags
        OrthFlags orth_flags

    # C array of hashes, one per string view; not visible from Python.
    cdef StringHash* string_views

    # Look up one entry of string_views. `except 0` makes a return value of 0
    # the error signal.
    cpdef StringHash get_view(self, size_t i) except 0

    # Generic bit tests against possible_tags / dist_flags / orth_flags.
    cpdef bint can_tag(self, TagFlags flag) except *
    cpdef bint check_dist_flag(self, DistFlags flag) except *
    cpdef bint check_orth_flag(self, OrthFlags flag) except *

    # Named distributional-flag predicates.
    cpdef bint is_often_titled(self) except *
    cpdef bint is_often_uppered(self) except *

    # Named orthographic-flag predicates, one per OrthFlag enumerator.
    cpdef bint is_alpha(self) except *
    cpdef bint is_digit(self) except *
    cpdef bint is_punct(self) except *
    cpdef bint is_space(self) except *
    cpdef bint is_lower(self) except *
    cpdef bint is_upper(self) except *
    cpdef bint is_title(self) except *
    cpdef bint is_ascii(self) except *

239
spacy/word.pyx Normal file
View File

@ -0,0 +1,239 @@
# cython: profile=True
# cython: embedsignature=True
from libc.stdlib cimport calloc, free
# Python-visible enum for POS tags: consecutive integers starting at 0, in the
# order listed. Word.can_tag() uses the value as a bit position.
(
    PUNCT,
    CONJ,
    NUM,
    X,
    DET,
    ADP,
    ADJ,
    ADV,
    VERB,
    NOUN,
    PDT,
    POS,
    PRON,
    PRT,
) = range(14)

# Compile-time bit positions of the distributional flags in Word.dist_flags.
DEF OFT_UPPER = 1
DEF OFT_TITLE = 2
cdef class Word:
    """A lexical type.

    Attributes:
        string (bytes):
            A utf8-encoded byte-string for the word.

        lex (StringHash):
            A hash of the word.

        length (size_t):
            The length of the word, computed as len(string) on the utf8
            bytes. This equals the unicode length only for ASCII words.

        prob (double):
            An estimate of the word's unigram log probability.

            Probabilities are calculated from a large text corpus, and smoothed using
            simple Good-Turing.  Estimates are read from data/en/probabilities, and
            can be replaced using spacy.en.load_probabilities.

        cluster (int):
            An integer representation of the word's Brown cluster.

            A Brown cluster is an address into a binary tree, which gives some (noisy)
            information about the word's distributional context.

            >>> strings = (u'pineapple', u'apple', u'dapple', u'scalable')
            >>> print ["{0:b}".format(lookup(s).cluster) for s in strings]
            ["100111110110", "100111100100", "01010111011001", "100111110110"]

            The clusterings are unideal, but often slightly useful.
            "pineapple" and "apple" share a long prefix, indicating a similar meaning,
            while "dapple" is totally different. On the other hand, "scalable" receives
            the same cluster ID as "pineapple", which is not what we'd like.
    """
    def __cinit__(self, bytes string, list string_views):
        """Allocate and fill the C-level fields.

        Args:
            string: utf8-encoded bytes of the word.
            string_views: list of unicode strings; the hash of each one is
                stored, in order, for retrieval with get_view().

        Raises:
            MemoryError: if either C buffer cannot be allocated.
        """
        cdef size_t n_bytes = len(string)
        cdef size_t n_views = len(string_views)
        cdef char* src = string
        cdef unicode view
        cdef size_t i

        # Keep a private NUL-terminated copy. Storing `src` directly would
        # leave self.string dangling as soon as the caller's bytes object is
        # garbage-collected, because no reference to it is held here.
        self.string = <char*>calloc(n_bytes + 1, sizeof(char))
        if self.string == NULL:
            raise MemoryError()
        for i in range(n_bytes):
            self.string[i] = src[i]

        self.length = n_bytes
        # Hash values are truncated to the 32-bit StringHash type.
        self.lex = <StringHash>hash(string)

        self.string_views = <StringHash*>calloc(n_views, sizeof(StringHash))
        # calloc(0, ...) may legitimately return NULL, so only treat NULL as
        # failure when something was actually requested.
        if self.string_views == NULL and n_views != 0:
            raise MemoryError()
        for i in range(n_views):
            view = string_views[i]
            self.string_views[i] = <StringHash>hash(view)

    def __dealloc__(self):
        # Both buffers are owned by this object; free(NULL) is a no-op, so
        # this is safe even if __cinit__ failed part-way through.
        free(self.string)
        free(self.string_views)

    cpdef StringHash get_view(self, size_t i) except 0:
        """Return the stored hash of the i-th string view.

        No bounds check is performed: i must be smaller than the number of
        views passed to the constructor. Because the method is declared
        `except 0`, a stored hash of 0 cannot be told apart from an error
        return.
        """
        return self.string_views[i]

    cpdef bint check_orth_flag(self, OrthFlags flag) except *:
        """Access the value of one of the pre-computed boolean orthographic features.

        Meanings depend on the language-specific orthographic features being loaded.
        The suggested features for latin-alphabet languages are: TODO
        """
        return (self.orth_flags & (1 << flag)) != 0

    cpdef bint check_dist_flag(self, DistFlags flag) except *:
        """Access the value of one of the pre-computed boolean distribution features.

        Meanings depend on the language-specific distributional features being loaded.
        The suggested features for latin-alphabet languages are: TODO
        """
        return (self.dist_flags & (1 << flag)) != 0

    cpdef bint can_tag(self, TagFlags flag) except *:
        """Check whether the word often receives a particular tag in a large text
        corpus. "Often" is chosen by heuristic.

        `flag` is a bit position (e.g. one of the module-level POS constants),
        not a mask.
        """
        # Shift a 64-bit one: a plain `1 << flag` is evaluated at C int width
        # and is undefined for bit positions of 31 and above.
        return (self.possible_tags & (<TagFlags>1 << flag)) != 0

    cpdef bint is_often_uppered(self) except *:
        """Check the OFT_UPPER distributional flag for the word.

        The OFT_UPPER flag records whether a lower-cased version of the word
        is found in all-upper case frequently in a large sample of text, where
        "frequently" is defined as P >= 0.95 (chosen for high mutual information for
        POS tagging).

        Case statistics are estimated from a large text corpus. Estimates are read
        from data/en/case_stats, and can be replaced using spacy.en.load_case_stats.

        >>> is_often_uppered(lookup(u'nato'))
        True
        >>> is_often_uppered(lookup(u'the'))
        False
        """
        return (self.dist_flags & (1 << OFT_UPPER)) != 0

    cpdef bint is_often_titled(self) except *:
        """Check the OFT_TITLE distributional flag for the word.

        The OFT_TITLE flag records whether a lower-cased version of the word
        is found title-cased (see string.istitle) frequently in a large sample of text,
        where "frequently" is defined as P >= 0.3 (chosen for high mutual information for
        POS tagging).

        Case statistics are estimated from a large text corpus. Estimates are read
        from data/en/case_stats, and can be replaced using spacy.en.load_case_stats.

        >>> is_often_titled(lookup(u'john'))
        True
        >>> is_often_titled(lookup(u'Bill'))
        False
        """
        return (self.dist_flags & (1 << OFT_TITLE)) != 0

    cpdef bint is_alpha(self) except *:
        """Check whether all characters in the word's string are alphabetic.

        Should match the :py:func:`unicode.isalpha()` function.

        >>> is_alpha(lookup(u'Hello'))
        True
        >>> is_alpha(lookup(u'العرب'))
        True
        >>> is_alpha(lookup(u'10'))
        False
        """
        return (self.orth_flags & (1 << IS_ALPHA)) != 0

    cpdef bint is_digit(self) except *:
        """Check whether all characters in the word's string are numeric.

        Should match the :py:func:`unicode.isdigit()` function.

        >>> is_digit(lookup(u'10'))
        True
        >>> is_digit(lookup(u'<unicode digit>'))
        True
        >>> is_digit(lookup(u'one'))
        False
        """
        return (self.orth_flags & (1 << IS_DIGIT)) != 0

    cpdef bint is_punct(self) except *:
        """Check whether all characters belong to a punctuation unicode data category
        for a Lexeme ID.

        >>> is_punct(lookup(u'.'))
        True
        >>> is_punct(lookup(u'<unicode punctuation>'))
        True
        >>> is_punct(lookup(u' '))
        False
        """
        return (self.orth_flags & (1 << IS_PUNCT)) != 0

    cpdef bint is_space(self) except *:
        """Give the result of unicode.isspace() for a Lexeme ID.

        >>> is_space(lookup(u'\\t'))
        True
        >>> is_space(lookup(u'<unicode space>'))
        True
        >>> is_space(lookup(u'Hi\\n'))
        False
        """
        return (self.orth_flags & (1 << IS_SPACE)) != 0

    cpdef bint is_lower(self) except *:
        """Give the result of unicode.islower() for a Lexeme ID.

        >>> is_lower(lookup(u'hi'))
        True
        >>> is_lower(lookup(u'<unicode lowercase>'))
        True
        >>> is_lower(lookup(u'10'))
        False
        """
        return (self.orth_flags & (1 << IS_LOWER)) != 0

    cpdef bint is_upper(self) except *:
        """Give the result of unicode.isupper() for a Lexeme ID.

        >>> is_upper(lookup(u'HI'))
        True
        >>> is_upper(lookup(u'H10'))
        True
        >>> is_upper(lookup(u'10'))
        False
        """
        return (self.orth_flags & (1 << IS_UPPER)) != 0

    cpdef bint is_title(self) except *:
        """Give the result of unicode.istitle() for a Lexeme ID.

        >>> is_title(lookup(u'Hi'))
        True
        >>> is_title(lookup(u'Hi1'))
        True
        >>> is_title(lookup(u'1'))
        False
        """
        return (self.orth_flags & (1 << IS_TITLE)) != 0

    cpdef bint is_ascii(self) except *:
        """Give the result of checking whether all characters in the string are ascii.

        >>> is_ascii(lookup(u'Hi'))
        True
        >>> is_ascii(lookup(u' '))
        True
        >>> is_ascii(lookup(u'<unicode>'))
        False
        """
        return (self.orth_flags & (1 << IS_ASCII)) != 0