* More refactoring

Matthew Honnibal 2014-08-25 16:42:22 +02:00
parent 88095666dc
commit 68bae2fec6
18 changed files with 358 additions and 864 deletions

View File

@@ -45,13 +45,13 @@ else:
 exts = [
-    #Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes),
-    Extension("spacy.en", ["spacy/en.pyx"], language="c++",
-              include_dirs=includes),
-    Extension("spacy.ptb3", ["spacy/ptb3.pyx"], language="c++", include_dirs=includes),
-    Extension("spacy.spacy", ["spacy/spacy.pyx"], language="c++", include_dirs=includes),
+    Extension("spacy.lang", ["spacy/spacy.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.word", ["spacy/word.pyx"], language="c++",
               include_dirs=includes),
+    Extension("spacy.ptb3", ["spacy/ptb3.pyx"], language="c++", include_dirs=includes),
+    Extension("spacy.en", ["spacy/en.pyx"], language="c++",
+              include_dirs=includes),
 ]

View File

@@ -1,25 +0,0 @@
from libc.stdint cimport uint64_t
from chartree cimport CharTree

cdef class FixedTable:
    cdef size_t size
    cdef uint64_t* keys
    cdef size_t* values

    cdef size_t insert(self, uint64_t key, size_t value) nogil
    cdef size_t get(self, uint64_t key) nogil
    cdef int erase(self, uint64_t key) nogil

cdef class WordTree:
    cdef size_t max_length
    cdef size_t default
    cdef CharTree* _trees
    cdef dict _dict

    cdef size_t get(self, unicode string) except *
    cdef int set(self, unicode string, size_t value) except *
    cdef bint contains(self, unicode string) except *

View File

@@ -1,98 +0,0 @@
from libc.stdlib cimport calloc, free
import cython

cimport chartree

cdef class FixedTable:
    def __cinit__(self, const size_t size):
        self.size = size
        self.keys = <uint64_t*>calloc(self.size, sizeof(uint64_t))
        self.values = <size_t*>calloc(self.size, sizeof(size_t))

    def __dealloc__(self):
        free(self.keys)
        free(self.values)

    def __getitem__(self, uint64_t key):
        return self.get(key)

    def __setitem__(self, uint64_t key, size_t value):
        self.insert(key, value)

    def pop(self, uint64_t key):
        self.delete(key)

    def bucket(self, uint64_t key):
        return _find(key, self.size)

    cdef size_t insert(self, uint64_t key, size_t value) nogil:
        cdef size_t bucket = _find(key, self.size)
        cdef size_t clobbered
        if self.values[bucket] == value:
            clobbered = 0
        else:
            clobbered = self.values[bucket]
        self.keys[bucket] = key
        self.values[bucket] = value
        return clobbered

    cdef size_t get(self, uint64_t key) nogil:
        cdef size_t bucket = _find(key, self.size)
        if self.keys[bucket] == key:
            return self.values[bucket]
        else:
            return 0

    cdef int erase(self, uint64_t key) nogil:
        cdef size_t bucket = _find(key, self.size)
        self.keys[bucket] = 0
        self.values[bucket] = 0

@cython.cdivision
cdef inline size_t _find(uint64_t key, size_t size) nogil:
    return key % size

cdef class WordTree:
    def __cinit__(self, size_t default, size_t max_length):
        self.max_length = max_length
        self.default = default
        self._trees = <CharTree*>calloc(max_length, sizeof(CharTree))
        for i in range(self.max_length):
            chartree.init(&self._trees[i], i)
        self._dict = {}

    cdef size_t get(self, unicode ustring) except *:
        cdef bytes bstring = ustring.encode('utf8')
        cdef size_t length = len(bstring)
        if length >= self.max_length:
            return self._dict.get(bstring, 0)
        else:
            return chartree.getitem(&self._trees[length], bstring)

    cdef int set(self, unicode ustring, size_t value) except *:
        cdef bytes bstring = ustring.encode('utf8')
        cdef size_t length = len(bstring)
        if length >= self.max_length:
            self._dict[bstring] = value
        else:
            chartree.setitem(&self._trees[length], bstring, value)

    cdef bint contains(self, unicode ustring) except *:
        cdef bytes bstring = ustring.encode('utf8')
        cdef size_t length = len(bstring)
        if length >= self.max_length:
            return bstring in self._dict
        else:
            return chartree.contains(&self._trees[length], bstring)

    def __getitem__(self, unicode key):
        return self.get(key)

    def __setitem__(self, unicode key, size_t value):
        self.set(key, value)

    def __contains__(self, unicode key):
        return self.contains(key)
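
For orientation, the FixedTable being deleted here is a fixed-size table with no collision resolution: the bucket is just key % size, and a colliding key clobbers whatever was there. A minimal plain-Python sketch of the same behavior (PyFixedTable is the editor's name, for illustration only):

class PyFixedTable:
    # Sketch of the deleted FixedTable: insert() overwrites any entry
    # already occupying the bucket and returns the clobbered value.
    def __init__(self, size):
        self.size = size
        self.keys = [0] * size
        self.values = [0] * size

    def insert(self, key, value):
        bucket = key % self.size
        clobbered = 0 if self.values[bucket] == value else self.values[bucket]
        self.keys[bucket] = key
        self.values[bucket] = value
        return clobbered

    def get(self, key):
        bucket = key % self.size
        return self.values[bucket] if self.keys[bucket] == key else 0

The clobbering design keeps a lookup to one modulo and one compare, which is why the Cython version can declare its methods nogil.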

View File

@@ -1,15 +1,38 @@
 from libcpp.vector cimport vector
-from spacy.spacy cimport StringHash
 from spacy.spacy cimport Language
-from spacy.word cimport LatinWord
+from spacy.word cimport Lexeme
 cimport cython
+cpdef size_t ALPHA
+cpdef size_t DIGIT
+cpdef size_t PUNCT
+cpdef size_t SPACE
+cpdef size_t LOWER
+cpdef size_t UPPER
+cpdef size_t TITLE
+cpdef size_t ASCII
+cpdef size_t OFT_LOWER
+cpdef size_t OFT_TITLE
+cpdef size_t OFT_UPPER
+cpdef size_t PUNCT
+cpdef size_t CONJ
+cpdef size_t NUM
+cpdef size_t N
+cpdef size_t DET
+cpdef size_t ADP
+cpdef size_t ADJ
+cpdef size_t ADV
+cpdef size_t VERB
+cpdef size_t NOUN
+cpdef size_t PDT
+cpdef size_t POS
+cpdef size_t PRON
+cpdef size_t PRT
 cdef class English(spacy.Language):
     cdef int find_split(self, unicode word)
-    cdef LatinWord new_lexeme(self, unicode string)
 cdef English EN
@@ -17,4 +40,3 @@ cdef English EN
 cpdef Word lookup(unicode word)
 cpdef list tokenize(unicode string)
-cpdef unicode unhash(StringHash hash_value)

View File

@@ -43,9 +43,85 @@ from libc.stdint cimport uint64_t
 cimport spacy
+# Python-readable flag constants --- can't read an enum from Python
+# Don't want to manually assign these numbers, or we'll insert one and have to
+# change them all.
+# Don't use "i", as we don't want it in the global scope!
+cdef size_t __i = 0
+ALPHA = __i; __i += 1
+DIGIT = __i; __i += 1
+PUNCT = __i; __i += 1
+SPACE = __i; __i += 1
+LOWER = __i; __i += 1
+UPPER = __i; __i += 1
+TITLE = __i; __i += 1
+ASCII = __i; __i += 1
+OFT_LOWER = __i; __i += 1
+OFT_UPPER = __i; __i += 1
+OFT_TITLE = __i; __i += 1
+PUNCT = __i; __i += 1
+CONJ = __i; __i += 1
+NUM = __i; __i += 1
+X = __i; __i += 1
+DET = __i; __i += 1
+ADP = __i; __i += 1
+ADJ = __i; __i += 1
+ADV = __i; __i += 1
+VERB = __i; __i += 1
+NOUN = __i; __i += 1
+PDT = __i; __i += 1
+POS = __i; __i += 1
+PRON = __i; __i += 1
+PRT = __i; __i += 1
+# These are for the string views
+__i = 0
+SIC = __i; __i += 1
+CANON_CASED = __i; __i += 1
+NON_SPARSE = __i; __i += 1
+SHAPE = __i; __i += 1
+NR_STRING_VIEWS = __i
+def get_string_views(unicode string, lexeme):
+    views = ['' for _ in range(NR_STRING_VIEWS)]
+    views[SIC] = string
+    views[CANON_CASED] = canonicalize_case(string, lexeme)
+    views[SHAPE] = get_string_shape(string)
+    views[NON_SPARSE] = get_non_sparse(string, views[CANON_CASED], views[SHAPE],
+                                       lexeme)
+    return views
+def set_orth_flags(unicode string, flags_t flags):
+    setters = [
+        (ALPHA, is_alpha),
+        (DIGIT, is_digit),
+        (PUNCT, is_punct),
+        (SPACE, is_space),
+        (LOWER, is_lower),
+        (UPPER, is_upper),
+        (SPACE, is_space)
+    ]
+    for bit, setter in setters:
+        if setter(string):
+            flags |= 1 << bit
+    return flags
 cdef class English(spacy.Language):
-    cdef LatinWord new_lexeme(self, unicode string):
-        return LatinWord(string)
+    cdef Lexeme new_lexeme(self, unicode string, cluster=0, prob=0, case_stats=None,
+                           tag_freqs=None):
+        return Lexeme(s, length, views, prob=prob, cluster=cluster,
+                      flags=self.get_flags(string))
     cdef int find_split(self, unicode word):
         cdef size_t length = len(word)
@@ -101,7 +177,7 @@ cpdef list tokenize(unicode string):
     return EN.tokenize(string)
-cpdef Word lookup(unicode string):
+cpdef Lexeme lookup(unicode string):
     """Retrieve (or create, if not found) a Lexeme for a string, and return its ID.
     Properties of the Lexeme are accessed by passing LexID to the accessor methods.
@@ -116,23 +192,6 @@ cpdef Word lookup(unicode string):
     return EN.lookup(string)
-cpdef unicode unhash(StringHash hash_value):
-    """Retrieve a string from a hash value. Mostly used for testing.
-    In general you should avoid computing with strings, as they are slower than
-    the intended ID-based usage. However, strings can be recovered if necessary,
-    although no control is taken for hash collisions.
-    Args:
-        hash_value (StringHash): The hash of a string, returned by Python's hash()
-            function.
-    Returns:
-        string (unicode): A unicode string that hashes to the hash_value.
-    """
-    return EN.unhash(hash_value)
 def add_string_views(view_funcs):
     """Add a string view to existing and previous lexical entries.
@@ -150,16 +209,19 @@ def load_clusters(location):
     """
     pass
 def load_unigram_probs(location):
     """Load unigram probabilities.
     """
     pass
 def load_case_stats(location):
     """Load case stats.
     """
     pass
 def load_tag_stats(location):
     """Load tag statistics.
     """

View File

@@ -1,16 +1,12 @@
 from libc.stdint cimport uint32_t
 from libc.stdint cimport uint64_t
-from spacy.word cimport Word
+from spacy.word cimport Lexeme
-ctypedef uint32_t StringHash
 cdef class Language:
     cdef object name
-    cdef dict chunks
-    cdef dict vocab
-    cdef dict bacov
+    cdef dict blobs
+    cdef dict lexicon
     cpdef list tokenize(self, unicode text)
@@ -20,8 +16,5 @@ cdef class Language:
     cdef list new_chunk(self, unicode string, list substrings)
     cdef Word new_lexeme(self, unicode lex)
-    cpdef unicode unhash(self, StringHash hashed)
     cpdef list find_substrings(self, unicode chunk)
     cdef int find_split(self, unicode word)
-    cdef int set_orth(self, unicode string, Word word)

View File

@@ -15,16 +15,13 @@ from libc.stdlib cimport calloc, free
 from . import util
 from os import path
-TAGS = {}
-DIST_FLAGS = {}
 cdef class Language:
     view_funcs = []
     def __cinit__(self, name):
         self.name = name
-        self.bacov = {}
-        self.chunks = {}
-        self.vocab = {}
+        self.blobs = {}
+        self.lexicon = {}
         self.load_tokenization(util.read_tokenization(name))
         self.load_dist_info(util.read_dist_info(name))
@@ -37,26 +34,26 @@ cdef class Language:
         string (unicode): The string to split.
         Returns:
-            tokens (Tokens): A Tokens object.
+            tokens (list): A list of Lexeme objects.
         """
-        cdef list chunk
+        cdef list blob
         cdef list tokens = []
         cdef size_t length = len(string)
         cdef size_t start = 0
         cdef size_t i = 0
         for c in string:
-            if _is_whitespace(c):
+            if c == ' ':
                 if start < i:
-                    chunk = self.lookup_chunk(string[start:i])
-                    tokens.extend(chunk)
+                    blob = self.lookup_blob(string[start:i])
+                    tokens.extend(blob)
                 start = i + 1
             i += 1
         if start < i:
-            chunk = self.lookup_chunk(string[start:])
+            chunk = self.lookup_blob(string[start:])
             tokens.extend(chunk)
         return tokens
-    cdef Word lookup(self, unicode string):
+    cdef Lexeme lookup(self, unicode string):
         assert len(string) != 0
         cdef Word word
         if string in self.vocab:
@@ -65,28 +62,26 @@ cdef class Language:
             word = self.new_lexeme(string)
         return word
-    cdef list lookup_chunk(self, unicode string):
+    cdef list lookup_blob(self, unicode string):
         cdef list chunk
-        cdef size_t chunk_id
-        if string in self.chunks:
-            chunk = self.chunks[string]
+        cdef size_t blob_id
+        if string in self.blobs:
+            blob = self.blobs[string]
         else:
-            chunk = self.new_chunk(string, self.find_substrings(string))
+            blob = self.new_blob(string, self.find_substrings(string))
         return chunk
-    cdef list new_chunk(self, unicode string, list substrings):
-        chunk = []
+    cdef list new_blob(self, unicode string, list substrings):
+        blob = []
         for i, substring in enumerate(substrings):
-            chunk.append(self.lookup(substring))
-        self.chunks[string] = chunk
-        return chunk
+            blob.append(self.lookup(substring))
+        self.blobs[string] = chunk
+        return blob
     cdef Word new_lexeme(self, unicode string):
-        string_views = [view_func(string) for view_func in self.view_funcs]
-        word = Word(string.encode('utf8'), string_views)
-        self.bacov[word.lex] = string
-        self.vocab[string] = word
-        return word
+        # TODO
+        #lexeme = Lexeme(string.encode('utf8'), string_views)
+        #return lexeme
     """
     def add_view_funcs(self, list view_funcs):
@@ -112,11 +107,7 @@ cdef class Language:
             self.bacov[hashed] = view
     """
-    cpdef unicode unhash(self, StringHash hash_value):
-        '''Fetch a string from the reverse index, given its hash value.'''
-        return self.bacov[hash_value]
-    cpdef list find_substrings(self, unicode chunk):
+    cpdef list find_substrings(self, unicode blob):
         """Find how to split a chunk into substrings.
         This method calls find_split repeatedly. Most languages will want to
@@ -129,21 +120,18 @@ cdef class Language:
             substrings (list): The component substrings, e.g. [u"Mike", "'s", "!"].
         """
         substrings = []
-        while chunk:
-            split = self.find_split(chunk)
+        while blob:
+            split = self.find_split(blob)
             if split == 0:
-                substrings.append(chunk)
+                substrings.append(blob)
                 break
-            substrings.append(chunk[:split])
-            chunk = chunk[split:]
+            substrings.append(blob[:split])
+            blob = blob[split:]
         return substrings
     cdef int find_split(self, unicode word):
         return len(word)
-    cdef int set_orth(self, unicode string, Word word):
-        pass
     def load_tokenization(self, token_rules):
         '''Load special-case tokenization rules.
@@ -178,22 +166,3 @@ cdef class Language:
             w.dist_flags |= DIST_FLAGS[flag]
         for tag in word_dist.tagdict:
             w.possible_tags |= TAGS[tag]
-cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
-    if c == ' ':
-        return True
-    elif c == '\n':
-        return True
-    elif c == '\t':
-        return True
-    else:
-        return False
-#cdef inline int _extend(Tokens tokens, Lexeme** chunk) nogil:
-#    cdef size_t i = 0
-#    while chunk[i] != NULL:
-#        tokens.vctr[0].push_back(<Lexeme_addr>chunk[i])
-#        tokens.length += 1
-#        i += 1
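
The control flow of tokenize and find_substrings above reads more easily without the Cython declarations: tokenize splits on spaces and looks each space-delimited "blob" up in a cache, and find_substrings peels prefixes off a blob until find_split returns 0. A plain-Python sketch of the latter (find_substrings is written as a free function here for illustration; find_split is passed in):

def find_substrings(blob, find_split):
    # Peel off blob[:split] repeatedly; split == 0 means "keep the rest whole".
    substrings = []
    while blob:
        split = find_split(blob)
        if split == 0:
            substrings.append(blob)
            break
        substrings.append(blob[:split])
        blob = blob[split:]
    return substrings

# With the base-class rule find_split = len, every blob stays whole:
assert find_substrings(u"Mike's!", len) == [u"Mike's!"]
# A language-specific find_split would instead return split points that
# produce e.g. [u"Mike", u"'s", u"!"], as the docstring above illustrates.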

View File

@@ -1,32 +0,0 @@
cdef enum OrthFlag:
    IS_ALPHA
    IS_DIGIT
    IS_PUNCT
    IS_SPACE
    IS_LOWER
    IS_UPPER
    IS_TITLE
    IS_ASCII

cdef enum:
    NORM
    SHAPE
    LAST3

from spacy.lexeme cimport LexID
from spacy.lexeme cimport StringHash

cpdef bint is_alpha(LexID lex_id) except *
cpdef bint is_digit(LexID lex_id) except *
cpdef bint is_punct(LexID lex_id) except *
cpdef bint is_space(LexID lex_id) except *
cpdef bint is_lower(LexID lex_id) except *
cpdef bint is_upper(LexID lex_id) except *
cpdef bint is_title(LexID lex_id) except *
cpdef bint is_ascii(LexID lex_id) except *

cpdef StringHash norm_of(LexID lex_id) except 0
cpdef StringHash shape_of(LexID lex_id) except 0
cpdef StringHash last3_of(LexID lex_id) except 0

View File

@@ -1,211 +0,0 @@
# cython: embedsignature=True
from __future__ import unicode_literals
from spacy.lexeme cimport Lexeme

def get_normalized(unicode word):
    """Todo.
    Args:
        word (unicode)
    Returns:
        normalized (unicode)
    """
    if word.isalpha() and word.islower():
        return word
    else:
        return get_word_shape(word)

def get_word_shape(unicode word):
    """Todo.
    Args:
        word (unicode)
    Returns:
        shape (unicode)
    """
    cdef size_t length = len(word)
    shape = ""
    last = ""
    shape_char = ""
    seq = 0
    for c in word:
        if c.isalpha():
            if c.isupper():
                shape_char = "X"
            else:
                shape_char = "x"
        elif c.isdigit():
            shape_char = "d"
        else:
            shape_char = c
        if shape_char == last:
            seq += 1
        else:
            seq = 0
        last = shape_char
        if seq < 3:
            shape += shape_char
    assert shape
    return shape

cpdef unicode get_last3(unicode string):
    return string[-3:]

cpdef bint is_alpha(LexID lex_id) except *:
    """Check whether all characters in the word's string are alphabetic.
    Should match the :py:func:`unicode.isalpha()` function.
    >>> is_alpha(lookup(u'Hello'))
    True
    >>> is_alpha(lookup(u'العرب'))
    True
    >>> is_alpha(lookup(u'10'))
    False
    """
    return (<Lexeme*>lex_id).orth_flags & 1 << IS_ALPHA

cpdef bint is_digit(LexID lex_id) except *:
    """Check whether all characters in the word's string are numeric.
    Should match the :py:func:`unicode.isdigit()` function.
    >>> is_digit(lookup(u'10'))
    True
    >>> is_digit(lookup(u''))
    True
    >>> is_digit(lookup(u'one'))
    False
    """
    return (<Lexeme*>lex_id).orth_flags & 1 << IS_DIGIT

cpdef bint is_punct(LexID lex_id) except *:
    """Check whether all characters belong to a punctuation unicode data category
    for a Lexeme ID.
    >>> is_punct(lookup(u'.'))
    True
    >>> is_punct(lookup(u'⁒'))
    True
    >>> is_punct(lookup(u' '))
    False
    """
    return (<Lexeme*>lex_id).orth_flags & 1 << IS_PUNCT

cpdef bint is_space(LexID lex_id) except *:
    """Give the result of unicode.isspace() for a Lexeme ID.
    >>> is_space(lookup(u'\\t'))
    True
    >>> is_space(lookup(u'<unicode space>'))
    True
    >>> is_space(lookup(u'Hi\\n'))
    False
    """
    return (<Lexeme*>lex_id).orth_flags & 1 << IS_SPACE

cpdef bint is_lower(LexID lex_id) except *:
    """Give the result of unicode.islower() for a Lexeme ID.
    >>> is_lower(lookup(u'hi'))
    True
    >>> is_lower(lookup(<unicode>))
    True
    >>> is_lower(lookup(u'10'))
    False
    """
    return (<Lexeme*>lex_id).orth_flags & 1 << IS_LOWER

cpdef bint is_upper(LexID lex_id) except *:
    """Give the result of unicode.isupper() for a Lexeme ID.
    >>> is_upper(lookup(u'HI'))
    True
    >>> is_upper(lookup(u'H10'))
    True
    >>> is_upper(lookup(u'10'))
    False
    """
    return (<Lexeme*>lex_id).orth_flags & 1 << IS_UPPER

cpdef bint is_title(LexID lex_id) except *:
    """Give the result of unicode.istitle() for a Lexeme ID.
    >>> is_title(lookup(u'Hi'))
    True
    >>> is_title(lookup(u'Hi1'))
    True
    >>> is_title(lookup(u'1'))
    False
    """
    return (<Lexeme*>lex_id).orth_flags & 1 << IS_TITLE

cpdef bint is_ascii(LexID lex_id) except *:
    """Give the result of checking whether all characters in the string are ascii.
    >>> is_ascii(lookup(u'Hi'))
    True
    >>> is_ascii(lookup(u' '))
    True
    >>> is_title(lookup(u'<unicode>'))
    False
    """
    return (<Lexeme*>lex_id).orth_flags & 1 << IS_ASCII

cpdef StringHash norm_of(LexID lex_id) except 0:
    """Return the hash of a "normalized" version of the string.
    Normalized strings are intended to be less sparse, while still capturing
    important lexical information. See :py:func:`spacy.latin.orthography.normalize_string`
    for details of the normalization function.
    >>> unhash(norm_of(lookup(u'Hi')))
    u'hi'
    >>> unhash(norm_of(lookup(u'255667')))
    u'shape=dddd'
    >>> unhash(norm_of(lookup(u'...')))
    u'...'
    """
    return (<Lexeme*>lex_id).string_views[NORM]

cpdef StringHash shape_of(LexID lex_id) except 0:
    """Return the hash of a string describing the word's "orthographic shape".
    Orthographic shapes are calculated by the :py:func:`spacy.orthography.latin.string_shape`
    function. Word shape features have been found useful for NER and POS tagging,
    e.g. Manning (2011)
    >>> unhash(shape_of(lookup(u'Hi')))
    u'Xx'
    >>> unhash(shape_of(lookup(u'255667')))
    u'dddd'
    >>> unhash(shape_of(lookup(u'...')))
    u'...'
    """
    cdef Lexeme* w = <Lexeme*>lex_id
    return w.string_views[SHAPE]

cpdef StringHash last3_of(LexID lex_id) except 0:
    '''Return the hash of string[-3:], i.e. the last three characters of the word.
    >>> lex_ids = [lookup(w) for w in (u'Hello', u'!')]
    >>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
    [u'llo', u'!']
    '''
    return (<Lexeme*>lex_id).string_views[LAST3]
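
Worked examples for get_normalized and get_word_shape above, hand-traced from the loop (the seq < 3 test keeps at most the first three characters of any repeated run; these values are the editor's traces, not doctests shipped with the module):

    >>> get_word_shape(u'Hello')
    u'Xxxx'
    >>> get_word_shape(u'2014-08-25')
    u'ddd-dd-dd'
    >>> get_normalized(u'hello')
    u'hello'
    >>> get_normalized(u'Hello')
    u'Xxxx'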

View File

@@ -1,7 +0,0 @@
cpdef bytes to_bytes(unicode string)
cpdef unicode from_bytes(bytes string)
cpdef unicode substr(unicode string, int start, int end, size_t length)
cdef bint is_whitespace(Py_UNICODE c)

View File

@@ -1,35 +0,0 @@
# cython: profile=True

cpdef bytes to_bytes(unicode string):
    return string.encode('utf8')

cpdef unicode from_bytes(bytes string):
    return string.decode('utf8')

cpdef unicode substr(unicode string, int start, int end, size_t length):
    if end >= length:
        end = -1
    if start >= length:
        start = 0
    if start <= 0 and end < 0:
        return string
    elif start < 0:
        start = 0
    elif end < 0:
        end = length
    return string[start:end]

cdef bint is_whitespace(Py_UNICODE c):
    # TODO: Support other unicode spaces
    # https://www.cs.tut.fi/~jkorpela/chars/spaces.html
    if c == u' ':
        return True
    elif c == u'\n':
        return True
    elif c == u'\t':
        return True
    else:
        return False
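
The clamping rules in substr are easier to see with concrete values. Below is py_substr, a plain-Python port written for illustration (the name is the editor's, not part of the module):

def py_substr(string, start, end, length):
    # Straight port of substr above: clamp out-of-range bounds, and return
    # the whole string when both bounds are open.
    if end >= length:
        end = -1
    if start >= length:
        start = 0
    if start <= 0 and end < 0:
        return string
    elif start < 0:
        start = 0
    elif end < 0:
        end = length
    return string[start:end]

assert py_substr(u'hello', 1, 3, 5) == u'el'      # in-range slice
assert py_substr(u'hello', 0, 99, 5) == u'hello'  # end past the string: whole string
assert py_substr(u'hello', -2, 3, 5) == u'hel'    # negative start clamps to 0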

View File

@@ -1,18 +0,0 @@
from libcpp.vector cimport vector
from spacy.lexeme cimport LexID
from spacy.lexeme cimport Lexeme
from cython.operator cimport dereference as deref
from spacy.spacy cimport Language

cdef class Tokens:
    cdef Language lang
    cdef vector[LexID]* vctr
    cdef size_t length

    cpdef int append(self, LexID token)
    cpdef int extend(self, Tokens other) except -1

    cpdef object group_by(self, size_t attr)
    cpdef dict count_by(self, size_t attr)

View File

@@ -1,92 +0,0 @@
from cython.operator cimport dereference as deref
from cython.operator cimport preincrement as inc

from spacy.lexeme cimport Lexeme
from spacy.spacy cimport StringHash

cdef class Tokens:
    def __cinit__(self, Language lang):
        self.lang = lang
        self.vctr = new vector[LexID]()
        self.length = 0

    def __dealloc__(self):
        del self.vctr

    def __iter__(self):
        cdef vector[LexID].iterator it = self.vctr[0].begin()
        while it != self.vctr[0].end():
            yield deref(it)
            inc(it)

    def __getitem__(self, size_t idx):
        return self.vctr[0].at(idx)

    def __len__(self):
        return self.length

    cpdef int append(self, LexID token):
        self.vctr[0].push_back(token)
        self.length += 1

    cpdef int extend(self, Tokens other) except -1:
        cdef LexID el
        for el in other:
            self.append(el)

    cpdef object group_by(self, size_t view_idx):
        '''Group tokens that share the property attr into Tokens instances, and
        return a list of them. Returns a tuple of three lists:
        (string names, hashes, tokens)
        The lists are aligned, so the ith entry in string names is the string
        that the ith entry in hashes unhashes to, which the Tokens instance
        is grouped by.
        You can then use count_by or group_by on the Tokens
        for further processing. Calling group_by and then asking the length
        of the Tokens objects is equivalent to count_by, but somewhat slower.
        '''
        # Implementation here is working around some of the constraints in
        # Cython about what type of thing can go in what type of container.
        # Long story short, it's pretty hard to get a Python object like
        # Tokens into a vector or array. If we really need this to run faster,
        # we can be tricky and get the Python list access out of the loop. What
        # we'd do is store pointers to the underlying vectors.
        # So far, speed isn't mattering here.
        cdef dict indices = {}
        cdef list groups = []
        cdef list names = []
        cdef list hashes = []
        cdef StringHash key
        cdef LexID t
        for t in self.vctr[0]:
            if view_idx == 0:
                key = (<Lexeme*>t).lex
            else:
                key = (<Lexeme*>t).string_views[view_idx - 1]
            if key in indices:
                groups[indices[key]].append(t)
            else:
                indices[key] = len(groups)
                groups.append(Tokens(self.lang))
                names.append(self.lang.unhash(key))
                hashes.append(key)
                groups[-1].append(t)
        return names, hashes, groups

    cpdef dict count_by(self, size_t attr):
        counts = {}
        cdef LexID t
        cdef StringHash key
        for t in self.vctr[0]:
            #key = attr_of(t, attr)
            key = 0
            if key not in counts:
                counts[key] = 0
            counts[key] += 1
        return counts
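
The grouping logic above, minus the Cython pointer work: tokens are grouped by a key, and the three returned lists stay aligned by construction. A plain-Python sketch (group_by and key_of are the editor's illustrative names; key_of stands in for the lex / string_views lookup, and str(key) for lang.unhash):

def group_by(tokens, key_of):
    indices, names, hashes, groups = {}, [], [], []
    for t in tokens:
        key = key_of(t)
        if key in indices:
            groups[indices[key]].append(t)
        else:
            # First time this key is seen: open a new aligned slot in all
            # three output lists.
            indices[key] = len(groups)
            names.append(str(key))
            hashes.append(key)
            groups.append([t])
    return names, hashes, groups

names, hashes, groups = group_by([3, 5, 3, 7], key_of=lambda t: t % 3)
assert hashes == [0, 2, 1] and groups == [[3, 3], [5], [7]]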

View File

@@ -1,59 +1,25 @@
-from libc.stdint cimport uint32_t
-from libc.stdint cimport uint64_t
-ctypedef int ClusterID
-ctypedef uint32_t StringHash
-ctypedef size_t LexID
-ctypedef char OrthFlags
-ctypedef char DistFlags
-ctypedef uint64_t TagFlags
-cdef enum OrthFlag:
-    IS_ALPHA
-    IS_DIGIT
-    IS_PUNCT
-    IS_SPACE
-    IS_LOWER
-    IS_UPPER
-    IS_TITLE
-    IS_ASCII
-cdef enum:
-    NORM
-    SHAPE
-    LAST3
-cdef class Word:
+from .typedefs cimport hash_t, utf8_t, flag_t, id_t
+DEF MAX_FLAG = 64
+cdef class Lexeme:
     # NB: the readonly keyword refers to _Python_ access. The attributes are
     # writeable from Cython.
-    cdef readonly StringHash key
-    cdef readonly char** utf8_strings
+    cdef readonly id_t id
     cdef readonly size_t length
     cdef readonly double prob
-    cdef readonly ClusterID cluster
-    cdef readonly TagFlags possible_tags
-    cdef readonly DistFlags dist_flags
-    cdef readonly OrthFlags orth_flags
-    cpdef StringHash get_view(self, size_t i) except 0
-cdef class CasedWord(Word):
-    cpdef bint can_tag(self, TagFlags flag) except *
-    cpdef bint check_dist_flag(self, DistFlags flag) except *
-    cpdef bint check_orth_flag(self, OrthFlags flag) except *
-    cpdef bint is_often_titled(self) except *
-    cpdef bint is_often_uppered(self) except *
-    cpdef bint is_alpha(self) except *
-    cpdef bint is_digit(self) except *
-    cpdef bint is_punct(self) except *
-    cpdef bint is_space(self) except *
-    cpdef bint is_lower(self) except *
-    cpdef bint is_upper(self) except *
-    cpdef bint is_title(self) except *
-    cpdef bint is_ascii(self) except *
+    cdef readonly size_t cluster
+    cdef readonly utf8_t* strings
+    cdef readonly size_t nr_strings
+    cdef readonly flag_t flags
+    cpdef bint check_flag(self, size_t flag_id) except *
+    cpdef int set_flag(self, size_t flag_id) except -1
+    cpdef unicode get_string(self, size_t i) except *
+    cpdef id_t get_id(self, size_t i) except 0
+    cpdef int add_strings(self, list strings) except -1

View File

@@ -4,40 +4,32 @@
 from libc.stdlib cimport calloc, free
-# Python-visible enum for POS tags
-PUNCT = 0
-CONJ = 1
-NUM = 2
-X = 3
-DET = 4
-ADP = 5
-ADJ = 6
-ADV = 7
-VERB = 8
-NOUN = 9
-PDT = 10
-POS = 11
-PRON = 12
-PRT = 13
-DEF OFT_UPPER = 1
-DEF OFT_TITLE = 2
-cdef class Word:
+from spacy cimport flags
+cdef class Lexeme:
     """A lexical type.
+    Clients should avoid instantiating Lexemes directly, and instead use get_lexeme
+    from a language module, e.g. spacy.en.get_lexeme . This allows us to use only
+    one Lexeme object per lexical type.
     Attributes:
-        string (bytes):
-            A utf8-encoded byte-string for the word.
-        lex (StringHash):
-            A hash of the word.
+        id (view_id_t):
+            A unique ID of the word's string.
+            Implemented as the memory-address of the string,
+            as we use Python's string interning to guarantee that only one copy
+            of each string is seen.
+        string (unicode):
+            The unicode string.
+            Implemented as a property; relatively expensive.
         length (size_t):
-            The (unicode) length of the word.
+            The number of unicode code-points in the string.
         prob (double):
             An estimate of the word's unigram log probability.
@@ -60,186 +52,194 @@ cdef class Word:
     while "dapple" is totally different. On the other hand, "scalable" receives
     the same cluster ID as "pineapple", which is not what we'd like.
     """
-    def __cinit__(self, bytes string, list string_views, prob=0.0, cluster=0,
-                  orth_flags=0, dist_flags=0, possible_tags=0):
-        self.string = <char*>string
-        self.length = len(string)
-        self.views = <char**>calloc(len(string_views), sizeof(StringHash))
-        cdef unicode view
-        for i in range(len(string_views)):
-            view = string_views[i]
-            self.string_views[i] = hash(view)
+    def __cinit__(self, utf8_t string, size_t length, list views, prob=0.0,
+                  cluster=0, orth_flags=0, dist_flags=0, possible_tags=0):
+        self.id = <id_t>&string
+        self.length = length
+        self.nr_strings = 0
+        self.add_views(views)
     def __dealloc__(self):
-        free(self.string_views)
+        free(self.views)
-    cpdef StringHash get_view(self, size_t i) except 0:
-        return self.string_views[i]
+    property string:
+        def __get__(self):
+            return self.strings[0].decode('utf8')
-    cpdef bint check_orth_flag(self, OrthFlags flag) except *:
-        """Access the value of one of the pre-computed boolean orthographic features.
-        Meanings depend on the language-specific orthographic features being loaded.
-        The suggested features for latin-alphabet languages are: TODO
-        """
-        return self.orth_flags & (1 << flag)
+    cpdef unicode get_view_string(self, size_t i) except *:
+        assert i < self.nr_strings
+        return self.strings[i].decode('utf8')
+    cpdef intptr_t get_view_id(self, size_t i) except 0:
+        assert i < self.nr_strings
+        return <string_id_t>&self.views[i]
+    cpdef int add_views(self, list views) except -1:
+        self.nr_views += len(strings)
+        self.views = <char**>realloc(self.views, self.nr_views * sizeof(utf8_t))
+        cdef unicode view
+        cdef bytes utf8_string
+        for i, view in enumerate(strings):
+            view = string_views[i]
+            utf8_string = view.encode('utf8')
+            # Intern strings, allowing pointer comparison
+            utf8_string = intern(utf8_string)
+            self.views[i] = utf8_string
-    cpdef bint check_dist_flag(self, DistFlags flag) except *:
+    cpdef bint check_flag(self, size_t flag_id) except *:
         """Access the value of one of the pre-computed boolean distribution features.
         Meanings depend on the language-specific distributional features being loaded.
         The suggested features for latin-alphabet languages are: TODO
         """
-        return self.dist_flags & (1 << flag)
+        assert flag_id < flags.MAX_FLAG
+        return self.flags & (1 << flag_id)
-    cpdef bint can_tag(self, TagFlags flag) except *:
-        """Check whether the word often receives a particular tag in a large text
-        corpus. "Often" is chosen by heuristic.
-        """
-        return self.possible_tags & (1 << flag)
+    cpdef int set_flag(self, size_t flag_id) except -1:
+        assert flag_id < flags.MAX_FLAG
+        self.flags |= (1 << flag_id)
-cdef class CasedWord(Word):
-    def __cinit__(self, bytes string):
-        string_views = [get_normaized(string), get_word_shape(string), string[-3:]]
-        Word.__cinit__(self, string, string_views)
-
-    cpdef bint is_often_uppered(self) except *:
-        '''Check the OFT_UPPER distributional flag for the word.
-
-        The OFT_UPPER flag records whether a lower-cased version of the word
-        is found in all-upper case frequently in a large sample of text, where
-        "frequently" is defined as P >= 0.95 (chosen for high mutual information for
-        POS tagging).
-
-        Case statistics are estimated from a large text corpus. Estimates are read
-        from data/en/case_stats, and can be replaced using spacy.en.load_case_stats.
-
-        >>> is_often_uppered(lookup(u'nato'))
-        True
-        >>> is_often_uppered(lookup(u'the'))
-        False
-        '''
-        return self.dist_flags & (1 << OFT_UPPER)
-
-
-    cpdef bint is_often_titled(self) except *:
-        '''Check the OFT_TITLE distributional flag for the word.
-
-        The OFT_TITLE flag records whether a lower-cased version of the word
-        is found title-cased (see string.istitle) frequently in a large sample of text,
-        where "frequently" is defined as P >= 0.3 (chosen for high mutual information for
-        POS tagging).
-
-        Case statistics are estimated from a large text corpus. Estimates are read
-        from data/en/case_stats, and can be replaced using spacy.en.load_case_stats.
-
-        >>> is_oft_upper(lookup(u'john'))
-        True
-        >>> is_oft_upper(lookup(u'Bill'))
-        False
-        '''
-        return self.dist_flags & (1 << OFT_TITLE)
-
-
-    cpdef bint is_alpha(self) except *:
-        """Check whether all characters in the word's string are alphabetic.
-
-        Should match the :py:func:`unicode.isalpha()` function.
-
-        >>> is_alpha(lookup(u'Hello'))
-        True
-        >>> is_alpha(lookup(u'العرب'))
-        True
-        >>> is_alpha(lookup(u'10'))
-        False
-        """
-        return self.orth_flags & 1 << IS_ALPHA
-
-    cpdef bint is_digit(self) except *:
-        """Check whether all characters in the word's string are numeric.
-
-        Should match the :py:func:`unicode.isdigit()` function.
-
-        >>> is_digit(lookup(u'10'))
-        True
-        >>> is_digit(lookup(u''))
-        True
-        >>> is_digit(lookup(u'one'))
-        False
-        """
-        return self.orth_flags & 1 << IS_DIGIT
-
-    cpdef bint is_punct(self) except *:
-        """Check whether all characters belong to a punctuation unicode data category
-        for a Lexeme ID.
-
-        >>> is_punct(lookup(u'.'))
-        True
-        >>> is_punct(lookup(u'⁒'))
-        True
-        >>> is_punct(lookup(u' '))
-        False
-        """
-        return self.orth_flags & 1 << IS_PUNCT
-
-    cpdef bint is_space(self) except *:
-        """Give the result of unicode.isspace() for a Lexeme ID.
-
-        >>> is_space(lookup(u'\\t'))
-        True
-        >>> is_space(lookup(u'<unicode space>'))
-        True
-        >>> is_space(lookup(u'Hi\\n'))
-        False
-        """
-        return self.orth_flags & 1 << IS_SPACE
-
-    cpdef bint is_lower(self) except *:
-        """Give the result of unicode.islower() for a Lexeme ID.
-
-        >>> is_lower(lookup(u'hi'))
-        True
-        >>> is_lower(lookup(<unicode>))
-        True
-        >>> is_lower(lookup(u'10'))
-        False
-        """
-        return self.orth_flags & 1 << IS_LOWER
-
-    cpdef bint is_upper(self) except *:
-        """Give the result of unicode.isupper() for a Lexeme ID.
-
-        >>> is_upper(lookup(u'HI'))
-        True
-        >>> is_upper(lookup(u'H10'))
-        True
-        >>> is_upper(lookup(u'10'))
-        False
-        """
-        return self.orth_flags & 1 << IS_UPPER
-
-    cpdef bint is_title(self) except *:
-        """Give the result of unicode.istitle() for a Lexeme ID.
-
-        >>> is_title(lookup(u'Hi'))
-        True
-        >>> is_title(lookup(u'Hi1'))
-        True
-        >>> is_title(lookup(u'1'))
-        False
-        """
-        return self.orth_flags & 1 << IS_TITLE
-
-    cpdef bint is_ascii(self) except *:
-        """Give the result of checking whether all characters in the string are ascii.
-
-        >>> is_ascii(lookup(u'Hi'))
-        True
-        >>> is_ascii(lookup(u' '))
-        True
-        >>> is_title(lookup(u'<unicode>'))
-        False
-        """
-        return self.orth_flags & 1 << IS_ASCII
+#
+#cdef class CasedWord(Word):
+#    def __cinit__(self, bytes string, list views):
+#        Word.__cinit__(self, string, string_views)
+#
+#    cpdef bint is_often_uppered(self) except *:
+#        '''Check the OFT_UPPER distributional flag for the word.
+#
+#        The OFT_UPPER flag records whether a lower-cased version of the word
+#        is found in all-upper case frequently in a large sample of text, where
+#        "frequently" is defined as P >= 0.95 (chosen for high mutual information for
+#        POS tagging).
+#
+#        Case statistics are estimated from a large text corpus. Estimates are read
+#        from data/en/case_stats, and can be replaced using spacy.en.load_case_stats.
+#
+#        >>> is_often_uppered(lookup(u'nato'))
+#        True
+#        >>> is_often_uppered(lookup(u'the'))
+#        False
+#        '''
+#        return self.dist_flags & (1 << OFT_UPPER)
+#
+#
+#    cpdef bint is_often_titled(self) except *:
+#        '''Check the OFT_TITLE distributional flag for the word.
+#
+#        The OFT_TITLE flag records whether a lower-cased version of the word
+#        is found title-cased (see string.istitle) frequently in a large sample of text,
+#        where "frequently" is defined as P >= 0.3 (chosen for high mutual information for
+#        POS tagging).
+#
+#        Case statistics are estimated from a large text corpus. Estimates are read
+#        from data/en/case_stats, and can be replaced using spacy.en.load_case_stats.
+#
+#        >>> is_oft_upper(lookup(u'john'))
+#        True
+#        >>> is_oft_upper(lookup(u'Bill'))
+#        False
+#        '''
+#        return self.dist_flags & (1 << OFT_TITLE)
+#
+#
+#    cpdef bint is_alpha(self) except *:
+#        """Check whether all characters in the word's string are alphabetic.
+#
+#        Should match the :py:func:`unicode.isalpha()` function.
+#
+#        >>> is_alpha(lookup(u'Hello'))
+#        True
+#        >>> is_alpha(lookup(u'العرب'))
+#        True
+#        >>> is_alpha(lookup(u'10'))
+#        False
+#        """
+#        return self.orth_flags & 1 << IS_ALPHA
+#
+#    cpdef bint is_digit(self) except *:
+#        """Check whether all characters in the word's string are numeric.
+#
+#        Should match the :py:func:`unicode.isdigit()` function.
+#
+#        >>> is_digit(lookup(u'10'))
+#        True
+#        >>> is_digit(lookup(u''))
+#        True
+#        >>> is_digit(lookup(u'one'))
+#        False
+#        """
+#        return self.orth_flags & 1 << IS_DIGIT
+#
+#    cpdef bint is_punct(self) except *:
+#        """Check whether all characters belong to a punctuation unicode data category
+#        for a Lexeme ID.
+#
+#        >>> is_punct(lookup(u'.'))
+#        True
+#        >>> is_punct(lookup(u'⁒'))
+#        True
+#        >>> is_punct(lookup(u' '))
+#        False
+#        """
+#        return self.orth_flags & 1 << IS_PUNCT
+#
+#    cpdef bint is_space(self) except *:
+#        """Give the result of unicode.isspace() for a Lexeme ID.
+#
+#        >>> is_space(lookup(u'\\t'))
+#        True
+#        >>> is_space(lookup(u'<unicode space>'))
+#        True
+#        >>> is_space(lookup(u'Hi\\n'))
+#        False
+#        """
+#        return self.orth_flags & 1 << IS_SPACE
+#
+#    cpdef bint is_lower(self) except *:
+#        """Give the result of unicode.islower() for a Lexeme ID.
+#
+#        >>> is_lower(lookup(u'hi'))
+#        True
+#        >>> is_lower(lookup(<unicode>))
+#        True
+#        >>> is_lower(lookup(u'10'))
+#        False
+#        """
+#        return self.orth_flags & 1 << IS_LOWER
+#
+#    cpdef bint is_upper(self) except *:
+#        """Give the result of unicode.isupper() for a Lexeme ID.
+#
+#        >>> is_upper(lookup(u'HI'))
+#        True
+#        >>> is_upper(lookup(u'H10'))
+#        True
+#        >>> is_upper(lookup(u'10'))
+#        False
+#        """
+#        return self.orth_flags & 1 << IS_UPPER
+#
+#    cpdef bint is_title(self) except *:
+#        """Give the result of unicode.istitle() for a Lexeme ID.
+#
+#        >>> is_title(lookup(u'Hi'))
+#        True
+#        >>> is_title(lookup(u'Hi1'))
+#        True
+#        >>> is_title(lookup(u'1'))
+#        False
+#        """
+#        return self.orth_flags & 1 << IS_TITLE
+#
+#    cpdef bint is_ascii(self) except *:
+#        """Give the result of checking whether all characters in the string are ascii.
+#
+#        >>> is_ascii(lookup(u'Hi'))
+#        True
+#        >>> is_ascii(lookup(u' '))
+#        True
+#        >>> is_title(lookup(u'<unicode>'))
+#        False
+#        """
+#        return self.orth_flags & 1 << IS_ASCII
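
The new Lexeme design above leans on string interning: every view string is passed through intern(), so two equal strings share one object, a raw memory address can serve as a stable ID, and comparing views reduces to pointer comparison. A plain-Python illustration of the invariant (sys.intern is the Python 3 spelling of the intern() builtin used above):

import sys

a = sys.intern('-'.join(['pine', 'apple']))
b = sys.intern('-'.join(['pine', 'apple']))
assert a is b          # one interned copy, so identity implies equality
assert id(a) == id(b)  # the address can stand in as the string's unique ID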