* More refactoring

Matthew Honnibal 2014-08-25 16:42:22 +02:00
parent 88095666dc
commit 68bae2fec6
18 changed files with 358 additions and 864 deletions

View File

@@ -45,13 +45,13 @@ else:
exts = [
#Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes),
Extension("spacy.en", ["spacy/en.pyx"], language="c++",
include_dirs=includes),
Extension("spacy.ptb3", ["spacy/ptb3.pyx"], language="c++", include_dirs=includes),
Extension("spacy.spacy", ["spacy/spacy.pyx"], language="c++", include_dirs=includes),
Extension("spacy.lang", ["spacy/spacy.pyx"], language="c++", include_dirs=includes),
Extension("spacy.word", ["spacy/word.pyx"], language="c++",
include_dirs=includes),
Extension("spacy.ptb3", ["spacy/ptb3.pyx"], language="c++", include_dirs=includes),
Extension("spacy.en", ["spacy/en.pyx"], language="c++",
include_dirs=includes),
]
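
A note on how a list like this is consumed: it is normally handed to setuptools/distutils for compilation. A minimal sketch of the tail of such a setup.py, assuming Cython's standard cythonize() helper (the setup() call below is illustrative, not part of this hunk):

from distutils.core import setup
from Cython.Build import cythonize

setup(
    name='spacy',
    # 'exts' is the Extension list defined above; cythonize() generates the
    # C++ sources and returns ready-to-build Extension objects.
    ext_modules=cythonize(exts),
)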

View File

@@ -1,25 +0,0 @@
from libc.stdint cimport uint64_t
from chartree cimport CharTree
cdef class FixedTable:
cdef size_t size
cdef uint64_t* keys
cdef size_t* values
cdef size_t insert(self, uint64_t key, size_t value) nogil
cdef size_t get(self, uint64_t key) nogil
cdef int erase(self, uint64_t key) nogil
cdef class WordTree:
cdef size_t max_length
cdef size_t default
cdef CharTree* _trees
cdef dict _dict
cdef size_t get(self, unicode string) except *
cdef int set(self, unicode string, size_t value) except *
cdef bint contains(self, unicode string) except *

View File

@@ -1,98 +0,0 @@
from libc.stdlib cimport calloc, free
import cython
cimport chartree
cdef class FixedTable:
def __cinit__(self, const size_t size):
self.size = size
self.keys = <uint64_t*>calloc(self.size, sizeof(uint64_t))
self.values = <size_t*>calloc(self.size, sizeof(size_t))
def __dealloc__(self):
free(self.keys)
free(self.values)
def __getitem__(self, uint64_t key):
return self.get(key)
def __setitem__(self, uint64_t key, size_t value):
self.insert(key, value)
def pop(self, uint64_t key):
self.delete(key)
def bucket(self, uint64_t key):
return _find(key, self.size)
cdef size_t insert(self, uint64_t key, size_t value) nogil:
cdef size_t bucket = _find(key, self.size)
cdef size_t clobbered
if self.values[bucket] == value:
clobbered = 0
else:
clobbered = self.values[bucket]
self.keys[bucket] = key
self.values[bucket] = value
return clobbered
cdef size_t get(self, uint64_t key) nogil:
cdef size_t bucket = _find(key, self.size)
if self.keys[bucket] == key:
return self.values[bucket]
else:
return 0
cdef int erase(self, uint64_t key) nogil:
cdef size_t bucket = _find(key, self.size)
self.keys[bucket] = 0
self.values[bucket] = 0
@cython.cdivision
cdef inline size_t _find(uint64_t key, size_t size) nogil:
return key % size
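
The deleted FixedTable is a fixed-size, clobbering hash table: a key always lands in bucket key % size, and a colliding insert simply overwrites the previous occupant, returning the clobbered value so the caller can deal with the eviction. A pure-Python sketch of the same policy (names are illustrative):

class PyFixedTable:
    def __init__(self, size):
        self.size = size
        self.keys = [0] * size
        self.values = [0] * size

    def insert(self, key, value):
        bucket = key % self.size  # same bucket function as _find() above
        clobbered = 0 if self.values[bucket] == value else self.values[bucket]
        self.keys[bucket] = key
        self.values[bucket] = value
        return clobbered          # non-zero means an old entry was evicted

    def get(self, key):
        bucket = key % self.size
        # Only answer if the occupant really is this key; otherwise report 0.
        return self.values[bucket] if self.keys[bucket] == key else 0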
cdef class WordTree:
def __cinit__(self, size_t default, size_t max_length):
self.max_length = max_length
self.default = default
self._trees = <CharTree*>calloc(max_length, sizeof(CharTree))
for i in range(self.max_length):
chartree.init(&self._trees[i], i)
self._dict = {}
cdef size_t get(self, unicode ustring) except *:
cdef bytes bstring = ustring.encode('utf8')
cdef size_t length = len(bstring)
if length >= self.max_length:
return self._dict.get(bstring, 0)
else:
return chartree.getitem(&self._trees[length], bstring)
cdef int set(self, unicode ustring, size_t value) except *:
cdef bytes bstring = ustring.encode('utf8')
cdef size_t length = len(bstring)
if length >= self.max_length:
self._dict[bstring] = value
else:
chartree.setitem(&self._trees[length], bstring, value)
cdef bint contains(self, unicode ustring) except *:
cdef bytes bstring = ustring.encode('utf8')
cdef size_t length = len(bstring)
if length >= self.max_length:
return bstring in self._dict
else:
return chartree.contains(&self._trees[length], bstring)
def __getitem__(self, unicode key):
return self.get(key)
def __setitem__(self, unicode key, size_t value):
self.set(key, value)
def __contains__(self, unicode key):
return self.contains(key)
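
WordTree buckets entries by UTF-8 byte length: strings shorter than max_length go into a per-length CharTree, anything longer falls back to an ordinary dict. A pure-Python model of that routing, with plain dicts standing in for the C CharTrees (illustrative only):

class PyWordTree:
    def __init__(self, default, max_length):
        self.default = default   # stored for parity; the deleted code never reads it
        self.max_length = max_length
        self._trees = [dict() for _ in range(max_length)]  # one bucket per byte length
        self._dict = {}          # overflow store for long strings

    def _bucket(self, bstring):
        if len(bstring) >= self.max_length:
            return self._dict
        return self._trees[len(bstring)]

    def __setitem__(self, ustring, value):
        bstring = ustring.encode('utf8')
        self._bucket(bstring)[bstring] = value

    def __getitem__(self, ustring):
        bstring = ustring.encode('utf8')
        return self._bucket(bstring).get(bstring, 0)  # the deleted get() also falls back to 0

    def __contains__(self, ustring):
        bstring = ustring.encode('utf8')
        return bstring in self._bucket(bstring)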

View File

@@ -1,15 +1,38 @@
from libcpp.vector cimport vector
from spacy.spacy cimport StringHash
from spacy.spacy cimport Language
from spacy.word cimport LatinWord
from spacy.word cimport Lexeme
cimport cython
cpdef size_t ALPHA
cpdef size_t DIGIT
cpdef size_t PUNCT
cpdef size_t SPACE
cpdef size_t LOWER
cpdef size_t UPPER
cpdef size_t TITLE
cpdef size_t ASCII
cpdef size_t OFT_LOWER
cpdef size_t OFT_TITLE
cpdef size_t OFT_UPPER
cpdef size_t PUNCT
cpdef size_t CONJ
cpdef size_t NUM
cpdef size_t X
cpdef size_t DET
cpdef size_t ADP
cpdef size_t ADJ
cpdef size_t ADV
cpdef size_t VERB
cpdef size_t NOUN
cpdef size_t PDT
cpdef size_t POS
cpdef size_t PRON
cpdef size_t PRT
cdef class English(spacy.Language):
cdef int find_split(self, unicode word)
cdef LatinWord new_lexeme(self, unicode string)
cdef English EN
@@ -17,4 +40,3 @@ cdef English EN
cpdef Word lookup(unicode word)
cpdef list tokenize(unicode string)
cpdef unicode unhash(StringHash hash_value)

View File

@@ -43,9 +43,85 @@ from libc.stdint cimport uint64_t
cimport spacy
# Python-readable flag constants --- can't read an enum from Python
# Don't want to manually assign these numbers, or we'll insert one and have to
# change them all.
# Don't use "i", as we don't want it in the global scope!
cdef size_t __i = 0
ALPHA = __i; __i += 1
DIGIT = __i; __i += 1
PUNCT = __i; __i += 1
SPACE = __i; __i += 1
LOWER = __i; __i += 1
UPPER = __i; __i += 1
TITLE = __i; __i += 1
ASCII = __i; __i += 1
OFT_LOWER = __i; __i += 1
OFT_UPPER = __i; __i += 1
OFT_TITLE = __i; __i += 1
PUNCT = __i; __i += 1
CONJ = __i; __i += 1
NUM = __i; __i += 1
X = __i; __i += 1
DET = __i; __i += 1
ADP = __i; __i += 1
ADJ = __i; __i += 1
ADV = __i; __i += 1
VERB = __i; __i += 1
NOUN = __i; __i += 1
PDT = __i; __i += 1
POS = __i; __i += 1
PRON = __i; __i += 1
PRT = __i; __i += 1
# These are for the string views
__i = 0
SIC = __i; __i += 1
CANON_CASED = __i; __i += 1
NON_SPARSE = __i; __i += 1
SHAPE = __i; __i += 1
NR_STRING_VIEWS = __i
def get_string_views(unicode string, lexeme):
views = ['' for _ in range(NR_STRING_VIEWS)]
views[SIC] = string
views[CANON_CASED] = canonicalize_case(string, lexeme)
views[SHAPE] = get_string_shape(string)
views[NON_SPARSE] = get_non_sparse(string, views[CANON_CASED], views[SHAPE],
lexeme)
return views
def set_orth_flags(unicode string, flag_t flags):
setters = [
(ALPHA, is_alpha),
(DIGIT, is_digit),
(PUNCT, is_punct),
(SPACE, is_space),
(LOWER, is_lower),
(UPPER, is_upper),
]
for bit, setter in setters:
if setter(string):
flags |= 1 << bit
return flags
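
Each orthographic feature occupies one bit of the flags word: set_orth_flags packs them with flags |= 1 << bit, and a reader tests membership with flags & (1 << bit), as Lexeme.check_flag in word.pyx does. A tiny pure-Python illustration using the values the __i counter assigns above:

ALPHA, DIGIT, LOWER = 0, 1, 4   # positions as assigned by the __i counter

def has_flag(flags, bit):
    # Non-zero iff the bit is set; mirrors check_flag.
    return bool(flags & (1 << bit))

flags = 0
flags |= 1 << ALPHA
flags |= 1 << LOWER
assert has_flag(flags, ALPHA) and has_flag(flags, LOWER)
assert not has_flag(flags, DIGIT)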
cdef class English(spacy.Language):
cdef LatinWord new_lexeme(self, unicode string):
return LatinWord(string)
cdef Lexeme new_lexeme(self, unicode string, cluster=0, prob=0, case_stats=None,
tag_freqs=None):
return Lexeme(string, length, views, prob=prob, cluster=cluster,
flags=self.get_flags(string))
cdef int find_split(self, unicode word):
cdef size_t length = len(word)
@@ -101,7 +177,7 @@ cpdef list tokenize(unicode string):
return EN.tokenize(string)
cpdef Word lookup(unicode string):
cpdef Lexeme lookup(unicode string):
"""Retrieve (or create, if not found) a Lexeme for a string, and return its ID.
Properties of the Lexeme are accessed by passing LexID to the accessor methods.
@@ -116,23 +192,6 @@ cpdef Word lookup(unicode string):
return EN.lookup(string)
cpdef unicode unhash(StringHash hash_value):
"""Retrieve a string from a hash value. Mostly used for testing.
In general you should avoid computing with strings, as they are slower than
the intended ID-based usage. However, strings can be recovered if necessary,
although no check is made for hash collisions.
Args:
hash_value (StringHash): The hash of a string, returned by Python's hash()
function.
Returns:
string (unicode): A unicode string that hashes to the hash_value.
"""
return EN.unhash(hash_value)
def add_string_views(view_funcs):
"""Add a string view to existing and previous lexical entries.
@@ -150,16 +209,19 @@ def load_clusters(location):
"""
pass
def load_unigram_probs(location):
"""Load unigram probabilities.
"""
pass
def load_case_stats(location):
"""Load case stats.
"""
pass
def load_tag_stats(location):
"""Load tag statistics.
"""

View File

@@ -1,16 +1,12 @@
from libc.stdint cimport uint32_t
from libc.stdint cimport uint64_t
from spacy.word cimport Word
ctypedef uint32_t StringHash
from spacy.word cimport Lexeme
cdef class Language:
cdef object name
cdef dict chunks
cdef dict vocab
cdef dict bacov
cdef dict blobs
cdef dict lexicon
cpdef list tokenize(self, unicode text)
@@ -20,8 +16,5 @@ cdef class Language:
cdef list new_chunk(self, unicode string, list substrings)
cdef Word new_lexeme(self, unicode lex)
cpdef unicode unhash(self, StringHash hashed)
cpdef list find_substrings(self, unicode chunk)
cdef int find_split(self, unicode word)
cdef int set_orth(self, unicode string, Word word)

View File

@@ -15,16 +15,13 @@ from libc.stdlib cimport calloc, free
from . import util
from os import path
TAGS = {}
DIST_FLAGS = {}
cdef class Language:
view_funcs = []
def __cinit__(self, name):
self.name = name
self.bacov = {}
self.chunks = {}
self.vocab = {}
self.blobs = {}
self.lexicon = {}
self.load_tokenization(util.read_tokenization(name))
self.load_dist_info(util.read_dist_info(name))
@@ -37,26 +34,26 @@ cdef class Language:
string (unicode): The string to split.
Returns:
tokens (Tokens): A Tokens object.
tokens (list): A list of Lexeme objects.
"""
cdef list chunk
cdef list blob
cdef list tokens = []
cdef size_t length = len(string)
cdef size_t start = 0
cdef size_t i = 0
for c in string:
if _is_whitespace(c):
if c == ' ':
if start < i:
chunk = self.lookup_chunk(string[start:i])
tokens.extend(chunk)
blob = self.lookup_blob(string[start:i])
tokens.extend(blob)
start = i + 1
i += 1
if start < i:
chunk = self.lookup_chunk(string[start:])
chunk = self.lookup_blob(string[start:])
tokens.extend(chunk)
return tokens
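
The rewritten loop splits only on the literal space character (the removed _is_whitespace helper also treated '\n' and '\t' as breaks), then expands each space-delimited "blob" through the special-case table. A pure-Python model of the control flow, with lookup_blob passed in as a stand-in for the method:

def tokenize(string, lookup_blob):
    # lookup_blob maps a space-delimited substring to a list of tokens.
    tokens = []
    start = 0
    for i, c in enumerate(string):
        if c == ' ':
            if start < i:             # skip runs of spaces
                tokens.extend(lookup_blob(string[start:i]))
            start = i + 1
    if start < len(string):           # trailing blob, if any
        tokens.extend(lookup_blob(string[start:]))
    return tokens

print(tokenize(u"Mike's here!", lambda s: [s]))  # [u"Mike's", u'here!']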
cdef Word lookup(self, unicode string):
cdef Lexeme lookup(self, unicode string):
assert len(string) != 0
cdef Word word
if string in self.vocab:
@@ -65,28 +62,26 @@ cdef class Language:
word = self.new_lexeme(string)
return word
cdef list lookup_chunk(self, unicode string):
cdef list lookup_blob(self, unicode string):
cdef list chunk
cdef size_t chunk_id
if string in self.chunks:
chunk = self.chunks[string]
cdef size_t blob_id
if string in self.blobs:
blob = self.blobs[string]
else:
chunk = self.new_chunk(string, self.find_substrings(string))
blob = self.new_blob(string, self.find_substrings(string))
return chunk
cdef list new_chunk(self, unicode string, list substrings):
chunk = []
cdef list new_blob(self, unicode string, list substrings):
blob = []
for i, substring in enumerate(substrings):
chunk.append(self.lookup(substring))
self.chunks[string] = chunk
return chunk
blob.append(self.lookup(substring))
self.blobs[string] = blob
return blob
cdef Word new_lexeme(self, unicode string):
string_views = [view_func(string) for view_func in self.view_funcs]
word = Word(string.encode('utf8'), string_views)
self.bacov[word.lex] = string
self.vocab[string] = word
return word
# TODO
#lexeme = Lexeme(string.encode('utf8'), string_views)
#return lexeme
"""
def add_view_funcs(self, list view_funcs):
@@ -112,11 +107,7 @@ cdef class Language:
self.bacov[hashed] = view
"""
cpdef unicode unhash(self, StringHash hash_value):
'''Fetch a string from the reverse index, given its hash value.'''
return self.bacov[hash_value]
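
bacov (apparently "vocab" reversed) is a reverse index from a string's hash back to the string, populated as lexemes and views are stored; unhash is just a dict lookup into it. The round trip, sketched in plain Python:

bacov = {}

def remember(string):
    # Done implicitly whenever a new lexeme or string view is created.
    bacov[hash(string)] = string
    return hash(string)

def unhash(hash_value):
    return bacov[hash_value]

key = remember(u'pineapple')
assert unhash(key) == u'pineapple'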
cpdef list find_substrings(self, unicode chunk):
cpdef list find_substrings(self, unicode blob):
"""Find how to split a chunk into substrings.
This method calls find_split repeatedly. Most languages will want to
@@ -129,21 +120,18 @@ cdef class Language:
substrings (list): The component substrings, e.g. [u"Mike", "'s", "!"].
"""
substrings = []
while chunk:
split = self.find_split(chunk)
while blob:
split = self.find_split(blob)
if split == 0:
substrings.append(chunk)
substrings.append(blob)
break
substrings.append(chunk[:split])
chunk = chunk[split:]
substrings.append(blob[:split])
blob = blob[split:]
return substrings
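
So find_substrings peels one piece off the front of the blob per iteration, stopping when find_split answers 0 ("keep the rest whole"). A worked pure-Python example with a toy find_split (the real rules live in English.find_split, not shown here):

def find_substrings(blob, find_split):
    substrings = []
    while blob:
        split = find_split(blob)
        if split == 0:                      # no split point: keep the remainder whole
            substrings.append(blob)
            break
        substrings.append(blob[:split])
        blob = blob[split:]
    return substrings

def toy_find_split(blob):
    if blob[0] == "'":
        return 2 if len(blob) > 1 else 0    # peel a clitic like "'s"
    for i, c in enumerate(blob):
        if not c.isalpha():
            return i                        # stop before punctuation
    return 0                                # all alphabetic: keep whole

print(find_substrings(u"Mike's!", toy_find_split))  # [u'Mike', u"'s", u'!']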
cdef int find_split(self, unicode word):
return len(word)
cdef int set_orth(self, unicode string, Word word):
pass
def load_tokenization(self, token_rules):
'''Load special-case tokenization rules.
@@ -178,22 +166,3 @@ cdef class Language:
w.dist_flags |= DIST_FLAGS[flag]
for tag in word_dist.tagdict:
w.possible_tags |= TAGS[tag]
cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
if c == ' ':
return True
elif c == '\n':
return True
elif c == '\t':
return True
else:
return False
#cdef inline int _extend(Tokens tokens, Lexeme** chunk) nogil:
# cdef size_t i = 0
# while chunk[i] != NULL:
# tokens.vctr[0].push_back(<Lexeme_addr>chunk[i])
# tokens.length += 1
# i += 1

View File

@@ -1,32 +0,0 @@
cdef enum OrthFlag:
IS_ALPHA
IS_DIGIT
IS_PUNCT
IS_SPACE
IS_LOWER
IS_UPPER
IS_TITLE
IS_ASCII
cdef enum:
NORM
SHAPE
LAST3
from spacy.lexeme cimport LexID
from spacy.lexeme cimport StringHash
cpdef bint is_alpha(LexID lex_id) except *
cpdef bint is_digit(LexID lex_id) except *
cpdef bint is_punct(LexID lex_id) except *
cpdef bint is_space(LexID lex_id) except *
cpdef bint is_lower(LexID lex_id) except *
cpdef bint is_upper(LexID lex_id) except *
cpdef bint is_title(LexID lex_id) except *
cpdef bint is_ascii(LexID lex_id) except *
cpdef StringHash norm_of(LexID lex_id) except 0
cpdef StringHash shape_of(LexID lex_id) except 0
cpdef StringHash last3_of(LexID lex_id) except 0

View File

@@ -1,211 +0,0 @@
# cython: embedsignature=True
from __future__ import unicode_literals
from spacy.lexeme cimport Lexeme
def get_normalized(unicode word):
"""Todo.
Args:
word (unicode)
Returns:
normalized (unicode)
"""
if word.isalpha() and word.islower():
return word
else:
return get_word_shape(word)
def get_word_shape(unicode word):
"""Todo.
Args:
word (unicode)
Returns:
shape (unicode)
"""
cdef size_t length = len(word)
shape = ""
last = ""
shape_char = ""
seq = 0
for c in word:
if c.isalpha():
if c.isupper():
shape_char = "X"
else:
shape_char = "x"
elif c.isdigit():
shape_char = "d"
else:
shape_char = c
if shape_char == last:
seq += 1
else:
seq = 0
last = shape_char
if seq < 3:
shape += shape_char
assert shape
return shape
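
Worked examples of the two functions above, in doctest style (hand-computed from the code as written; runs of the same shape character are capped at three):

>>> get_word_shape(u'Hello2014!')
u'Xxxxddd!'
>>> get_normalized(u'dog')       # lower-case alphabetic words pass through
u'dog'
>>> get_normalized(u'Dog')       # anything else is replaced by its shape
u'Xxx'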
cpdef unicode get_last3(unicode string):
return string[-3:]
cpdef bint is_alpha(LexID lex_id) except *:
"""Check whether all characters in the word's string are alphabetic.
Should match the :py:func:`unicode.isalpha()` function.
>>> is_alpha(lookup(u'Hello'))
True
>>> is_alpha(lookup(u'العرب'))
True
>>> is_alpha(lookup(u'10'))
False
"""
return (<Lexeme*>lex_id).orth_flags & 1 << IS_ALPHA
cpdef bint is_digit(LexID lex_id) except *:
"""Check whether all characters in the word's string are numeric.
Should match the :py:func:`unicode.isdigit()` function.
>>> is_digit(lookup(u'10'))
True
>>> is_digit(lookup(u''))
True
>>> is_digit(lookup(u'one'))
False
"""
return (<Lexeme*>lex_id).orth_flags & 1 << IS_DIGIT
cpdef bint is_punct(LexID lex_id) except *:
"""Check whether all characters belong to a punctuation unicode data category
for a Lexeme ID.
>>> is_punct(lookup(u'.'))
True
>>> is_punct(lookup(u'⁒'))
True
>>> is_punct(lookup(u' '))
False
"""
return (<Lexeme*>lex_id).orth_flags & 1 << IS_PUNCT
cpdef bint is_space(LexID lex_id) except *:
"""Give the result of unicode.isspace() for a Lexeme ID.
>>> is_space(lookup(u'\\t'))
True
>>> is_space(lookup(u'<unicode space>'))
True
>>> is_space(lookup(u'Hi\\n'))
False
"""
return (<Lexeme*>lex_id).orth_flags & 1 << IS_SPACE
cpdef bint is_lower(LexID lex_id) except *:
"""Give the result of unicode.islower() for a Lexeme ID.
>>> is_lower(lookup(u'hi'))
True
>>> is_lower(lookup(<unicode>))
True
>>> is_lower(lookup(u'10'))
False
"""
return (<Lexeme*>lex_id).orth_flags & 1 << IS_LOWER
cpdef bint is_upper(LexID lex_id) except *:
"""Give the result of unicode.isupper() for a Lexeme ID.
>>> is_upper(lookup(u'HI'))
True
>>> is_upper(lookup(u'H10'))
True
>>> is_upper(lookup(u'10'))
False
"""
return (<Lexeme*>lex_id).orth_flags & 1 << IS_UPPER
cpdef bint is_title(LexID lex_id) except *:
"""Give the result of unicode.istitle() for a Lexeme ID.
>>> is_title(lookup(u'Hi'))
True
>>> is_title(lookup(u'Hi1'))
True
>>> is_title(lookup(u'1'))
False
"""
return (<Lexeme*>lex_id).orth_flags & 1 << IS_TITLE
cpdef bint is_ascii(LexID lex_id) except *:
"""Give the result of checking whether all characters in the string are ascii.
>>> is_ascii(lookup(u'Hi'))
True
>>> is_ascii(lookup(u' '))
True
>>> is_ascii(lookup(u'<unicode>'))
False
"""
return (<Lexeme*>lex_id).orth_flags & 1 << IS_ASCII
cpdef StringHash norm_of(LexID lex_id) except 0:
"""Return the hash of a "normalized" version of the string.
Normalized strings are intended to be less sparse, while still capturing
important lexical information. See :py:func:`spacy.latin.orthography.normalize_string`
for details of the normalization function.
>>> unhash(norm_of(lookup(u'Hi')))
u'hi'
>>> unhash(norm_of(lookup(u'255667')))
u'shape=dddd'
>>> unhash(norm_of(lookup(u'...')))
u'...'
"""
return (<Lexeme*>lex_id).string_views[NORM]
cpdef StringHash shape_of(LexID lex_id) except 0:
"""Return the hash of a string describing the word's "orthograpgic shape".
Orthographic shapes are calculated by the :py:func:`spacy.orthography.latin.string_shape`
function. Word shape features have been found useful for NER and POS tagging,
e.g. Manning (2011)
>>> unhash(shape_of(lookupu'Hi'))
u'Xx'
>>> unhash(shape_of(lookup(u'255667')))
u'dddd'
>>> unhash(shape_of(lookup(u'...')))
u'...'
"""
cdef Lexeme* w = <Lexeme*>lex_id
return w.string_views[SHAPE]
cpdef StringHash last3_of(LexID lex_id) except 0:
'''Return the hash of string[-3:], i.e. the last three characters of the word.
>>> lex_ids = [lookup(w) for w in (u'Hello', u'!')]
>>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
[u'llo', u'!']
'''
return (<Lexeme*>lex_id).string_views[LAST3]

View File

@@ -1,7 +0,0 @@
cpdef bytes to_bytes(unicode string)
cpdef unicode from_bytes(bytes string)
cpdef unicode substr(unicode string, int start, int end, size_t length)
cdef bint is_whitespace(Py_UNICODE c)

View File

@@ -1,35 +0,0 @@
# cython: profile=True
cpdef bytes to_bytes(unicode string):
return string.encode('utf8')
cpdef unicode from_bytes(bytes string):
return string.decode('utf8')
cpdef unicode substr(unicode string, int start, int end, size_t length):
if end >= length:
end = -1
if start >= length:
start = 0
if start <= 0 and end < 0:
return string
elif start < 0:
start = 0
elif end < 0:
end = length
return string[start:end]
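
The clamping rules above in action, doctest style (hand-computed from the deleted implementation):

>>> substr(u'hello', 1, 3, 5)
u'el'
>>> substr(u'hello', 0, 99, 5)   # end past the string: the whole string comes back
u'hello'
>>> substr(u'hello', -2, 3, 5)   # negative start: clamped to 0
u'hel'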
cdef bint is_whitespace(Py_UNICODE c):
# TODO: Support other unicode spaces
# https://www.cs.tut.fi/~jkorpela/chars/spaces.html
if c == u' ':
return True
elif c == u'\n':
return True
elif c == u'\t':
return True
else:
return False

View File

@@ -1,18 +0,0 @@
from libcpp.vector cimport vector
from spacy.lexeme cimport LexID
from spacy.lexeme cimport Lexeme
from cython.operator cimport dereference as deref
from spacy.spacy cimport Language
cdef class Tokens:
cdef Language lang
cdef vector[LexID]* vctr
cdef size_t length
cpdef int append(self, LexID token)
cpdef int extend(self, Tokens other) except -1
cpdef object group_by(self, size_t attr)
cpdef dict count_by(self, size_t attr)

View File

@@ -1,92 +0,0 @@
from cython.operator cimport dereference as deref
from cython.operator cimport preincrement as inc
from spacy.lexeme cimport Lexeme
from spacy.spacy cimport StringHash
cdef class Tokens:
def __cinit__(self, Language lang):
self.lang = lang
self.vctr = new vector[LexID]()
self.length = 0
def __dealloc__(self):
del self.vctr
def __iter__(self):
cdef vector[LexID].iterator it = self.vctr[0].begin()
while it != self.vctr[0].end():
yield deref(it)
inc(it)
def __getitem__(self, size_t idx):
return self.vctr[0].at(idx)
def __len__(self):
return self.length
cpdef int append(self, LexID token):
self.vctr[0].push_back(token)
self.length += 1
cpdef int extend(self, Tokens other) except -1:
cdef LexID el
for el in other:
self.append(el)
cpdef object group_by(self, size_t view_idx):
'''Group tokens that share the property attr into Tokens instances, and
return a list of them. Returns a tuple of three lists:
(string names, hashes, tokens)
The lists are aligned, so the ith entry in string names is the string
that the ith entry in hashes unhashes to, which the Tokens instance
is grouped by.
You can then use count_by or group_by on the Tokens
for further processing. Calling group_by and then asking the length
of the Tokens objects is equivalent to count_by, but somewhat slower.
'''
# Implementation here is working around some of the constraints in
# Cython about what type of thing can go in what type of container.
# Long story short, it's pretty hard to get a Python object like
# Tokens into a vector or array. If we really need this to run faster,
# we can be tricky and get the Python list access out of the loop. What
# we'd do is store pointers to the underlying vectors.
# So far, speed isn't mattering here.
cdef dict indices = {}
cdef list groups = []
cdef list names = []
cdef list hashes = []
cdef StringHash key
cdef LexID t
for t in self.vctr[0]:
if view_idx == 0:
key = (<Lexeme*>t).lex
else:
key = (<Lexeme*>t).string_views[view_idx - 1]
if key in indices:
groups[indices[key]].append(t)
else:
indices[key] = len(groups)
groups.append(Tokens(self.lang))
names.append(self.lang.unhash(key))
hashes.append(key)
groups[-1].append(t)
return names, hashes, groups
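
The grouping logic is a straightforward first-seen-index dict pattern. A pure-Python analogue, with an injected key function standing in for the view-hash lookup (illustrative, not the Cython API):

def group_by(tokens, key_of):
    indices, names, hashes, groups = {}, [], [], []
    for t in tokens:
        key = key_of(t)
        if key not in indices:
            indices[key] = len(groups)
            groups.append([])
            hashes.append(key)
            names.append(key)   # the Cython version stores self.lang.unhash(key)
        groups[indices[key]].append(t)
    return names, hashes, groups

names, hashes, groups = group_by([u'a', u'b', u'a'], key_of=lambda t: t)
assert groups == [[u'a', u'a'], [u'b']]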
cpdef dict count_by(self, size_t attr):
counts = {}
cdef LexID t
cdef StringHash key
for t in self.vctr[0]:
#key = attr_of(t, attr)
key = 0
if key not in counts:
counts[key] = 0
counts[key] += 1
return counts

View File

@@ -1,59 +1,25 @@
from libc.stdint cimport uint32_t
from libc.stdint cimport uint64_t
ctypedef int ClusterID
ctypedef uint32_t StringHash
ctypedef size_t LexID
ctypedef char OrthFlags
ctypedef char DistFlags
ctypedef uint64_t TagFlags
from .typedefs cimport hash_t, utf8_t, flag_t, id_t
cdef enum OrthFlag:
IS_ALPHA
IS_DIGIT
IS_PUNCT
IS_SPACE
IS_LOWER
IS_UPPER
IS_TITLE
IS_ASCII
DEF MAX_FLAG = 64
cdef enum:
NORM
SHAPE
LAST3
cdef class Word:
cdef class Lexeme:
# NB: the readonly keyword refers to _Python_ access. The attributes are
# writeable from Cython.
cdef readonly StringHash key
cdef readonly char** utf8_strings
cdef readonly id_t id
cdef readonly size_t length
cdef readonly double prob
cdef readonly ClusterID cluster
cdef readonly TagFlags possible_tags
cdef readonly DistFlags dist_flags
cdef readonly OrthFlags orth_flags
cdef readonly size_t cluster
cpdef StringHash get_view(self, size_t i) except 0
cdef readonly utf8_t* strings
cdef readonly size_t nr_strings
cdef readonly flag_t flags
cdef class CasedWord(Word):
cpdef bint can_tag(self, TagFlags flag) except *
cpdef bint check_dist_flag(self, DistFlags flag) except *
cpdef bint check_orth_flag(self, OrthFlags flag) except *
cpdef bint is_often_titled(self) except *
cpdef bint is_often_uppered(self) except *
cpdef bint is_alpha(self) except *
cpdef bint is_digit(self) except *
cpdef bint is_punct(self) except *
cpdef bint is_space(self) except *
cpdef bint is_lower(self) except *
cpdef bint is_upper(self) except *
cpdef bint is_title(self) except *
cpdef bint is_ascii(self) except *
cpdef bint check_flag(self, size_t flag_id) except *
cpdef int set_flag(self, size_t flag_id) except -1
cpdef unicode get_string(self, size_t i) except *
cpdef id_t get_id(self, size_t i) except 0
cpdef int add_strings(self, list strings) except -1

View File

@@ -4,40 +4,32 @@
from libc.stdlib cimport calloc, realloc, free
# Python-visible enum for POS tags
PUNCT = 0
CONJ = 1
NUM = 2
X = 3
DET = 4
ADP = 5
ADJ = 6
ADV = 7
VERB = 8
NOUN = 9
PDT = 10
POS = 11
PRON = 12
PRT = 13
from spacy cimport flags
DEF OFT_UPPER = 1
DEF OFT_TITLE = 2
cdef class Word:
cdef class Lexeme:
"""A lexical type.
Clients should avoid instantiating Lexemes directly, and instead use get_lexeme
from a language module, e.g. spacy.en.get_lexeme. This allows us to use only
one Lexeme object per lexical type.
Attributes:
string (bytes):
A utf8-encoded byte-string for the word.
lex (StringHash):
A hash of the word.
id (view_id_t):
A unique ID of the word's string.
Implemented as the memory-address of the string,
as we use Python's string interning to guarantee that only one copy
of each string is seen.
string (unicode):
The unicode string.
Implemented as a property; relatively expensive.
length (size_t):
The (unicode) length of the word.
The number of unicode code-points in the string.
prob (double):
An estimate of the word's unigram log probability.
@@ -60,186 +52,194 @@ cdef class Word:
while "dapple" is totally different. On the other hand, "scalable" receives
the same cluster ID as "pineapple", which is not what we'd like.
"""
def __cinit__(self, bytes string, list string_views, prob=0.0, cluster=0,
orth_flags=0, dist_flags=0, possible_tags=0):
self.string = <char*>string
self.length = len(string)
self.views = <char**>calloc(len(string_views), sizeof(StringHash))
cdef unicode view
for i in range(len(string_views)):
view = string_views[i]
self.string_views[i] = hash(view)
def __cinit__(self, utf8_t string, size_t length, list views, prob=0.0,
cluster=0, orth_flags=0, dist_flags=0, possible_tags=0):
self.id = <id_t>&string
self.length = length
self.nr_strings = 0
self.add_views(views)
def __dealloc__(self):
free(self.string_views)
free(self.views)
cpdef StringHash get_view(self, size_t i) except 0:
return self.string_views[i]
property string:
def __get__(self):
return self.strings[0].decode('utf8')
cpdef bint check_orth_flag(self, OrthFlags flag) except *:
"""Access the value of one of the pre-computed boolean orthographic features.
cpdef unicode get_view_string(self, size_t i) except *:
assert i < self.nr_strings
return self.strings[i].decode('utf8')
Meanings depend on the language-specific orthographic features being loaded.
The suggested features for latin-alphabet languages are: TODO
"""
return self.orth_flags & (1 << flag)
cpdef id_t get_view_id(self, size_t i) except 0:
assert i < self.nr_strings
return <id_t>&self.views[i]
cpdef bint check_dist_flag(self, DistFlags flag) except *:
cpdef int add_views(self, list views) except -1:
self.nr_strings += len(views)
self.views = <char**>realloc(self.views, self.nr_strings * sizeof(utf8_t))
cdef unicode view
cdef bytes utf8_string
for i, view in enumerate(views):
    utf8_string = view.encode('utf8')
    # Intern strings, allowing pointer comparison
    utf8_string = intern(utf8_string)
    self.views[i] = utf8_string
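
The intern() call above is what makes the pointer-as-ID scheme in get_view_id (and the id attribute) work: interning guarantees one shared object per distinct string, so an address is a stable identifier. The same effect in plain Python (sys.intern in Python 3; the bare intern builtin in the Python 2 this commit targets):

import sys

a = sys.intern('hello')
b = sys.intern(''.join(['hel', 'lo']))  # built at runtime, then interned
assert a is b                           # interning yields one shared object,
assert id(a) == id(b)                   # so its address can serve as an ID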
cpdef bint check_flag(self, size_t flag_id) except *:
"""Access the value of one of the pre-computed boolean distribution features.
Meanings depend on the language-specific distributional features being loaded.
The suggested features for latin-alphabet languages are: TODO
"""
return self.dist_flags & (1 << flag)
assert flag_id < flags.MAX_FLAG
return self.flags & (1 << flag_id)
cpdef bint can_tag(self, TagFlags flag) except *:
"""Check whether the word often receives a particular tag in a large text
corpus. "Often" is chosen by heuristic.
"""
return self.possible_tags & (1 << flag)
cpdef int set_flag(self, size_t flag_id) except -1:
assert flag_id < flags.MAX_FLAG
self.flags |= (1 << flag_id)
cdef class CasedWord(Word):
def __cinit__(self, bytes string):
string_views = [get_normalized(string), get_word_shape(string), string[-3:]]
Word.__cinit__(self, string, string_views)
cpdef bint is_often_uppered(self) except *:
'''Check the OFT_UPPER distributional flag for the word.
The OFT_UPPER flag records whether a lower-cased version of the word
is found in all-upper case frequently in a large sample of text, where
"frequently" is defined as P >= 0.95 (chosen for high mutual information for
POS tagging).
Case statistics are estimated from a large text corpus. Estimates are read
from data/en/case_stats, and can be replaced using spacy.en.load_case_stats.
>>> is_often_uppered(lookup(u'nato'))
True
>>> is_often_uppered(lookup(u'the'))
False
'''
return self.dist_flags & (1 << OFT_UPPER)
cpdef bint is_often_titled(self) except *:
'''Check the OFT_TITLE distributional flag for the word.
The OFT_TITLE flag records whether a lower-cased version of the word
is found title-cased (see string.istitle) frequently in a large sample of text,
where "frequently" is defined as P >= 0.3 (chosen for high mutual information for
POS tagging).
Case statistics are estimated from a large text corpus. Estimates are read
from data/en/case_stats, and can be replaced using spacy.en.load_case_stats.
>>> is_often_titled(lookup(u'john'))
True
>>> is_often_titled(lookup(u'Bill'))
False
'''
return self.dist_flags & (1 << OFT_TITLE)
cpdef bint is_alpha(self) except *:
"""Check whether all characters in the word's string are alphabetic.
Should match the :py:func:`unicode.isalpha()` function.
>>> is_alpha(lookup(u'Hello'))
True
>>> is_alpha(lookup(u'العرب'))
True
>>> is_alpha(lookup(u'10'))
False
"""
return self.orth_flags & 1 << IS_ALPHA
cpdef bint is_digit(self) except *:
"""Check whether all characters in the word's string are numeric.
Should match the :py:func:`unicode.isdigit()` function.
>>> is_digit(lookup(u'10'))
True
>>> is_digit(lookup(u''))
True
>>> is_digit(lookup(u'one'))
False
"""
return self.orth_flags & 1 << IS_DIGIT
cpdef bint is_punct(self) except *:
"""Check whether all characters belong to a punctuation unicode data category
for a Lexeme ID.
>>> is_punct(lookup(u'.'))
True
>>> is_punct(lookup(u'⁒'))
True
>>> is_punct(lookup(u' '))
False
"""
return self.orth_flags & 1 << IS_PUNCT
cpdef bint is_space(self) except *:
"""Give the result of unicode.isspace() for a Lexeme ID.
>>> is_space(lookup(u'\\t'))
True
>>> is_space(lookup(u'<unicode space>'))
True
>>> is_space(lookup(u'Hi\\n'))
False
"""
return self.orth_flags & 1 << IS_SPACE
cpdef bint is_lower(self) except *:
"""Give the result of unicode.islower() for a Lexeme ID.
>>> is_lower(lookup(u'hi'))
True
>>> is_lower(lookup(<unicode>))
True
>>> is_lower(lookup(u'10'))
False
"""
return self.orth_flags & 1 << IS_LOWER
cpdef bint is_upper(self) except *:
"""Give the result of unicode.isupper() for a Lexeme ID.
>>> is_upper(lookup(u'HI'))
True
>>> is_upper(lookup(u'H10'))
True
>>> is_upper(lookup(u'10'))
False
"""
return self.orth_flags & 1 << IS_UPPER
cpdef bint is_title(self) except *:
"""Give the result of unicode.istitle() for a Lexeme ID.
>>> is_title(lookup(u'Hi'))
True
>>> is_title(lookup(u'Hi1'))
True
>>> is_title(lookup(u'1'))
False
"""
return self.orth_flags & 1 << IS_TITLE
cpdef bint is_ascii(self) except *:
"""Give the result of checking whether all characters in the string are ascii.
>>> is_ascii(lookup(u'Hi'))
True
>>> is_ascii(lookup(u' '))
True
>>> is_ascii(lookup(u'<unicode>'))
False
"""
return self.orth_flags & 1 << IS_ASCII
#
#cdef class CasedWord(Word):
# def __cinit__(self, bytes string, list views):
# Word.__cinit__(self, string, string_views)
#
# cpdef bint is_often_uppered(self) except *:
# '''Check the OFT_UPPER distributional flag for the word.
#
# The OFT_UPPER flag records whether a lower-cased version of the word
# is found in all-upper case frequently in a large sample of text, where
# "frequently" is defined as P >= 0.95 (chosen for high mutual information for
# POS tagging).
#
# Case statistics are estimated from a large text corpus. Estimates are read
# from data/en/case_stats, and can be replaced using spacy.en.load_case_stats.
#
# >>> is_often_uppered(lookup(u'nato'))
# True
# >>> is_often_uppered(lookup(u'the'))
# False
# '''
# return self.dist_flags & (1 << OFT_UPPER)
#
#
# cpdef bint is_often_titled(self) except *:
# '''Check the OFT_TITLE distributional flag for the word.
#
# The OFT_TITLE flag records whether a lower-cased version of the word
# is found title-cased (see string.istitle) frequently in a large sample of text,
# where "frequently" is defined as P >= 0.3 (chosen for high mutual information for
# POS tagging).
#
# Case statistics are estimated from a large text corpus. Estimates are read
# from data/en/case_stats, and can be replaced using spacy.en.load_case_stats.
#
# >>> is_often_titled(lookup(u'john'))
# True
# >>> is_often_titled(lookup(u'Bill'))
# False
# '''
# return self.dist_flags & (1 << OFT_TITLE)
#
#
# cpdef bint is_alpha(self) except *:
# """Check whether all characters in the word's string are alphabetic.
#
# Should match the :py:func:`unicode.isalpha()` function.
#
# >>> is_alpha(lookup(u'Hello'))
# True
# >>> is_alpha(lookup(u'العرب'))
# True
# >>> is_alpha(lookup(u'10'))
# False
# """
# return self.orth_flags & 1 << IS_ALPHA
#
# cpdef bint is_digit(self) except *:
# """Check whether all characters in the word's string are numeric.
#
# Should match the :py:func:`unicode.isdigit()` function.
#
# >>> is_digit(lookup(u'10'))
# True
# >>> is_digit(lookup(u''))
# True
# >>> is_digit(lookup(u'one'))
# False
# """
# return self.orth_flags & 1 << IS_DIGIT
#
# cpdef bint is_punct(self) except *:
# """Check whether all characters belong to a punctuation unicode data category
# for a Lexeme ID.
#
# >>> is_punct(lookup(u'.'))
# True
# >>> is_punct(lookup(u'⁒'))
# True
# >>> is_punct(lookup(u' '))
# False
# """
# return self.orth_flags & 1 << IS_PUNCT
#
# cpdef bint is_space(self) except *:
# """Give the result of unicode.isspace() for a Lexeme ID.
#
# >>> is_space(lookup(u'\\t'))
# True
# >>> is_space(lookup(u'<unicode space>'))
# True
# >>> is_space(lookup(u'Hi\\n'))
# False
# """
# return self.orth_flags & 1 << IS_SPACE
#
# cpdef bint is_lower(self) except *:
# """Give the result of unicode.islower() for a Lexeme ID.
#
# >>> is_lower(lookup(u'hi'))
# True
# >>> is_lower(lookup(<unicode>))
# True
# >>> is_lower(lookup(u'10'))
# False
# """
# return self.orth_flags & 1 << IS_LOWER
#
# cpdef bint is_upper(self) except *:
# """Give the result of unicode.isupper() for a Lexeme ID.
#
# >>> is_upper(lookup(u'HI'))
# True
# >>> is_upper(lookup(u'H10'))
# True
# >>> is_upper(lookup(u'10'))
# False
# """
# return self.orth_flags & 1 << IS_UPPER
#
# cpdef bint is_title(self) except *:
# """Give the result of unicode.istitle() for a Lexeme ID.
#
# >>> is_title(lookup(u'Hi'))
# True
# >>> is_title(lookup(u'Hi1'))
# True
# >>> is_title(lookup(u'1'))
# False
# """
# return self.orth_flags & 1 << IS_TITLE
#
# cpdef bint is_ascii(self) except *:
# """Give the result of checking whether all characters in the string are ascii.
#
# >>> is_ascii(lookup(u'Hi'))
# True
# >>> is_ascii(lookup(u' '))
# True
# >>> is_ascii(lookup(u'<unicode>'))
# False
# """
# return self.orth_flags & 1 << IS_ASCII