* Broken version being refactored for docs

Matthew Honnibal 2014-08-20 13:39:39 +02:00
parent 5fddb8d165
commit a78ad4152d
8 changed files with 196 additions and 224 deletions

View File

@@ -10,6 +10,7 @@ from spacy.tokens cimport Tokens
cdef class English(spacy.Language):
cdef int find_split(self, unicode word)
cdef int set_orth(self, unicode word, Lexeme* lex) except -1
cdef English EN

View File

@@ -1,7 +1,8 @@
# cython: profile=True
'''Serve pointers to Lexeme structs, given strings. Maintain a reverse index,
so that strings can be retrieved from hashes. Use 64-bit hash values and
boldly assume no collisions.
# cython: embedsignature=True
'''Tokenize English text, allowing some differences from the Penn Treebank
tokenization, e.g. for email addresses, URLs, etc. Use en_ptb if full PTB
compatibility is the priority.
'''
from __future__ import unicode_literals
@@ -9,14 +10,17 @@ from libc.stdlib cimport malloc, calloc, free
from libc.stdint cimport uint64_t
from libcpp.vector cimport vector
from spacy.string_tools cimport substr
from . import util
cimport spacy
from spacy.orthography.latin cimport *
cdef class English(spacy.Language):
cdef int set_orth(self, unicode word, Lexeme* lex) except -1:
pass
cdef int find_split(self, unicode word):
cdef size_t length = len(word)
cdef int i = 0
@@ -26,17 +30,17 @@ cdef class English(spacy.Language):
if word.endswith("'s") and length >= 3:
return length - 2
# Leading punctuation
if is_punct(word, 0, length):
if check_punct(word, 0, length):
return 1
elif length >= 1:
# Split off all trailing punctuation characters
i = 0
while i < length and not is_punct(word, i, length):
while i < length and not check_punct(word, i, length):
i += 1
return i
cdef bint is_punct(unicode word, size_t i, size_t length):
cdef bint check_punct(unicode word, size_t i, size_t length):
# Don't count apostrophes as punct if the next char is a letter
if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
return i == 0
@@ -55,14 +59,52 @@ EN = English('en')
cpdef Tokens tokenize(unicode string):
"""Tokenize a string.
Wraps EN.tokenize, where EN is a global instance of the English class. The
global instance manages the vocabulary and memoizes tokenization rules.
Args:
string (unicode): The string to be split. Must be unicode, not bytes.
Returns:
tokens (Tokens): A Tokens instance, managing a vector of pointers to
Lexeme structs. The Tokens instance supports sequence interfaces,
but also offers a range of sequence-level operations, which are computed
efficiently in Cython-space.
"""
return EN.tokenize(string)
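A minimal usage sketch for the wrapper above, assuming the module is importable
as spacy.en and that iterating a Tokens instance yields Lexeme addresses, as the
docstring describes:

    # Hypothetical usage of the module-level tokenize() wrapper.
    from spacy import en
    from spacy.lexeme import lex_of

    tokens = en.tokenize(u"Hi! world")                  # Tokens backed by Lexeme* pointers
    strings = [en.unhash(lex_of(t)) for t in tokens]    # recover the token strings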
cpdef Lexeme_addr lookup(unicode string) except 0:
"""Retrieve (or create) a Lexeme for a string.
Returns a Lexeme ID, which can be used via the accessor
methods in spacy.lexeme.
Args:
string (unicode): The string to be looked up. Must be unicode, not bytes.
Returns:
LexemeID (size_t): An unsigned integer that allows the Lexeme to be retrieved.
The LexemeID is really a memory address, making dereferencing it essentially
free.
"""
return <Lexeme_addr>EN.lookup(string)
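A short sketch of the intended lookup workflow, pairing the returned address
with the accessor functions in spacy.lexeme (the probability value depends on
the distributional data having been loaded):

    # Hypothetical usage of lookup(); the Lexeme ID is an integer-valued pointer.
    from spacy import en
    from spacy.lexeme import length_of, prob_of

    lex_id = en.lookup(u"world")     # Lexeme_addr: created on first use, cached after
    n_chars = length_of(lex_id)      # 5
    logp = prob_of(lex_id)           # smoothed unigram log probability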
cpdef unicode unhash(StringHash hash_value):
"""Retrieve a string from a hash value. Mostly used for testing.
In general you should avoid computing with strings, as they are slower than
the intended ID-based usage. However, strings can be recovered if necessary,
although no protection against hash collisions is provided.
Args:
hash_value (uint32_t): The hash of a string, returned by Python's hash()
function.
Returns:
string (unicode): A unicode string that hashes to the hash_value.
"""
return EN.unhash(hash_value)
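Because the reverse index stores every string that has been hashed, unhash()
round-trips with the hashes returned by the accessors, e.g.:

    from spacy import en
    from spacy.lexeme import lex_of

    lex_id = en.lookup(u"Hello")
    h = lex_of(lex_id)               # StringHash of the exact string
    assert en.unhash(h) == u"Hello"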

View File

@@ -1,83 +1,34 @@
from libc.stdint cimport uint32_t
from libc.stdint cimport uint64_t
# Put these above the imports to avoid a circular import problem
ctypedef int ClusterID
ctypedef uint32_t StringHash
ctypedef size_t Lexeme_addr
ctypedef char Bits8
ctypedef uint64_t Bits64
cdef enum OrthFlag:
IS_ALPHA
IS_DIGIT
IS_PUNCT
IS_WHITE
IS_LOWER
IS_UPPER
IS_TITLE
IS_ASCII
cdef enum DistFlag:
OFT_UPPER
OFT_TITLE
DIST_FLAG3
DIST_FLAG4
DIST_FLAG5
DIST_FLAG6
DIST_FLAG7
DIST_FLAG8
cdef struct Orthography:
StringHash shape
StringHash norm
StringHash last3
Bits8 flags
cdef struct Distribution:
double prob
ClusterID cluster
Bits64 tagdict
Bits8 flags
ctypedef size_t LexID
ctypedef char OrthFlags
ctypedef char DistFlags
ctypedef uint64_t TagFlags
cdef struct Lexeme:
StringHash lex
char* string
size_t length
StringHash lex
Orthography orth # Extra orthographic views
Distribution dist # Distribution info
double prob
ClusterID cluster
TagFlags possible_tags
DistFlags dist_flags
OrthFlags orth_flags
StringHash* string_views
cdef Lexeme BLANK_WORD = Lexeme(NULL, 0, 0,
Orthography(0, 0, 0, 0),
Distribution(0.0, 0, 0, 0)
)
cpdef char first_of(LexID lex_id) except 0
cpdef size_t length_of(LexID lex_id) except 0
cpdef double prob_of(LexID lex_id) except 0
cpdef ClusterID cluster_of(LexID lex_id) except 0
cpdef bint check_tag_flag(LexID lex, TagFlags flag) except *
cpdef bint check_dist_flag(LexID lex, DistFlags flag) except *
cpdef bint check_orth_flag(LexID lex, OrthFlags flag) except *
cdef enum StringAttr:
LEX
NORM
SHAPE
LAST3
LENGTH
cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0
cpdef StringHash lex_of(size_t lex_id) except 0
cpdef StringHash norm_of(size_t lex_id) except 0
cpdef StringHash shape_of(size_t lex_id) except 0
cpdef StringHash last3_of(size_t lex_id) except 0
cpdef size_t length_of(size_t lex_id) except *
cpdef double prob_of(size_t lex_id) except 0
cpdef ClusterID cluster_of(size_t lex_id) except 0
cpdef bint check_orth_flag(size_t lex, OrthFlag flag) except *
cpdef bint check_dist_flag(size_t lex, DistFlag flag) except *
cpdef StringHash view_of(LexID lex_id, size_t view) except 0
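The flag enums above are bit positions rather than bit masks: check_orth_flag
and check_dist_flag test `flags & (1 << flag)`. A plain-Python sketch of that
convention (the numeric values simply mirror the declaration order of OrthFlag):

    IS_ALPHA, IS_DIGIT, IS_PUNCT, IS_WHITE = 0, 1, 2, 3   # first four OrthFlag members

    orth_flags = 0
    orth_flags |= 1 << IS_ALPHA
    orth_flags |= 1 << IS_DIGIT

    assert orth_flags & (1 << IS_ALPHA)       # set
    assert not orth_flags & (1 << IS_PUNCT)   # not set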

View File

@@ -1,32 +1,32 @@
# cython: profile=True
# cython: embedsignature=True
'''Accessors for Lexeme properties, given a lex_id, which is cast to a Lexeme*.
Mostly useful from Python-space. From Cython-space, you can just cast to
Lexeme* yourself.
'''
from __future__ import unicode_literals
from spacy.string_tools cimport substr
from libc.stdlib cimport malloc, calloc, free
from libc.stdint cimport uint64_t
from libcpp.vector cimport vector
from spacy.spacy cimport StringHash
cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0:
if attr == LEX:
return lex_of(lex_id)
elif attr == NORM:
return norm_of(lex_id)
elif attr == SHAPE:
return shape_of(lex_id)
elif attr == LAST3:
return last3_of(lex_id)
elif attr == LENGTH:
return length_of(lex_id)
else:
raise StandardError
cpdef int set_flags(LexID lex_id, object active_flags) except *:
"""Set orthographic bit flags for a Lexeme.
Args:
lex_id (LexemeID): A reference ID for a Lexeme.
active_flags: A sequence of bits to set as True.
"""
cdef size_t flag
cdef Lexeme* w = <Lexeme*>lex_id
for flag in active_flags:
w.orth_flags |= 1 << flag
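set_flags is the writer counterpart of check_orth_flag: each entry of
active_flags is treated as a bit position and OR-ed into orth_flags. A
pure-Python equivalent, for illustration only:

    def set_flags_py(orth_flags, active_flags):
        for flag in active_flags:
            orth_flags |= 1 << flag
        return orth_flags

    # With IS_ALPHA == 0 and IS_LOWER == 4 (matching the OrthFlag declaration order):
    set_flags_py(0, [0, 4])    # 0b10001 == 17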
cpdef StringHash view_of(LexID lex_id, size_t view) except 0:
return (<Lexeme*>lex_id).string_views[view]
cpdef StringHash lex_of(size_t lex_id) except 0:
@@ -37,42 +37,14 @@ cpdef StringHash lex_of(size_t lex_id) except 0:
delimited tokens split off. The other fields refer to properties of the
string that the lex field stores a hash of, except sic and tail.
>>> [unhash(lex_of(lex_id)) for lex_id in from_string(u'Hi! world')]
>>> from spacy import en
>>> [en.unhash(lex_of(lex_id)) for lex_id in en.tokenize(u'Hi! world')]
[u'Hi', u'!', u'world']
'''
return (<Lexeme*>lex_id).lex
cpdef StringHash norm_of(size_t lex_id) except 0:
'''Access the `norm' field of the Lexeme pointed to by lex_id.
The norm field is the hash of a normalized view of the word, rather than of the
exact string: the word itself when it is alphabetic and lower-cased, and its
word shape otherwise.
'''
return (<Lexeme*>lex_id).orth.norm
cpdef StringHash shape_of(size_t lex_id) except 0:
return (<Lexeme*>lex_id).orth.shape
cpdef StringHash last3_of(size_t lex_id) except 0:
'''Access the `last3' field of the Lexeme pointed to by lex_id, which stores
the hash of the last three characters of the word:
>>> lex_ids = [lookup(w) for w in (u'Hello', u'!')]
>>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
[u'llo', u'!']
'''
return (<Lexeme*>lex_id).orth.last3
cpdef ClusterID cluster_of(size_t lex_id) except 0:
cpdef ClusterID cluster_of(LexID lex_id) except 0:
'''Access the `cluster' field of the Lexeme pointed to by lex_id, which
gives an integer representation of the cluster ID of the word,
which should be understood as a binary address:
@@ -88,10 +60,10 @@ cpdef ClusterID cluster_of(size_t lex_id) except 0:
while "dapple" is totally different. On the other hand, "scalable" receives
the same cluster ID as "pineapple", which is not what we'd like.
'''
return (<Lexeme*>lex_id).dist.cluster
return (<Lexeme*>lex_id).cluster
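The cluster field holds a Brown-style binary address, so distributional
similarity shows up as a shared bit prefix. The cluster values in this sketch
are invented, purely to illustrate the prefix comparison:

    def prefix(cluster_id, n_bits, width=16):
        return format(cluster_id, '0{}b'.format(width))[:n_bits]

    pineapple = 0b0011001011010000    # hypothetical cluster ID
    apple     = 0b0011001011010001    # hypothetical: near-identical address
    dapple    = 0b1101100000000011    # hypothetical: unrelated address

    assert prefix(pineapple, 12) == prefix(apple, 12)
    assert prefix(pineapple, 12) != prefix(dapple, 12)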
cpdef Py_UNICODE first_of(size_t lex_id):
cpdef char first_of(size_t lex_id) except 0:
'''Access the `first' field of the Lexeme pointed to by lex_id, which
stores the first character of the lex string of the word.
@@ -99,10 +71,10 @@ cpdef Py_UNICODE first_of(size_t lex_id):
>>> unhash(first_of(lex_id))
u'H'
'''
return (<Lexeme*>lex_id).orth.first
return (<Lexeme*>lex_id).string[0]
cpdef size_t length_of(size_t lex_id) except *:
cpdef size_t length_of(size_t lex_id) except 0:
'''Access the `length' field of the Lexeme pointed to by lex_id, which stores
the length of the string hashed by lex_of.'''
cdef Lexeme* word = <Lexeme*>lex_id
@@ -119,8 +91,10 @@ cpdef double prob_of(size_t lex_id) except 0:
>>> prob_of(lookup(u'world'))
-20.10340371976182
'''
return (<Lexeme*>lex_id).dist.prob
return (<Lexeme*>lex_id).prob
DEF OFT_UPPER = 1
DEF OFT_TITLE = 2
cpdef bint is_oft_upper(size_t lex_id):
'''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which
@@ -134,7 +108,7 @@ cpdef bint is_oft_upper(size_t lex_id):
>>> is_oft_upper(lookup(u'aBc')) # This must get the same answer
True
'''
return (<Lexeme*>lex_id).dist.flags & OFT_UPPER
return (<Lexeme*>lex_id).dist_flags & (1 << OFT_UPPER)
cpdef bint is_oft_title(size_t lex_id):
@@ -149,11 +123,15 @@ cpdef bint is_oft_title(size_t lex_id):
>>> is_oft_title(lookup(u'MARCUS')) # This must get the same value
True
'''
return (<Lexeme*>lex_id).dist.flags & OFT_TITLE
return (<Lexeme*>lex_id).dist_flags & (1 << OFT_TITLE)
cpdef bint check_orth_flag(size_t lex_id, OrthFlag flag) except *:
return (<Lexeme*>lex_id).orth.flags & (1 << flag)
cpdef bint check_orth_flag(size_t lex_id, OrthFlags flag) except *:
return (<Lexeme*>lex_id).orth_flags & (1 << flag)
cpdef bint check_dist_flag(size_t lex_id, DistFlag flag) except *:
return (<Lexeme*>lex_id).dist.flags & (1 << flag)
cpdef bint check_dist_flag(size_t lex_id, DistFlags flag) except *:
return (<Lexeme*>lex_id).dist_flags & (1 << flag)
cpdef bint check_tag_flag(LexID lex_id, TagFlags flag) except *:
return (<Lexeme*>lex_id).possible_tags & (1 << flag)

View File

@@ -19,8 +19,6 @@ ctypedef int ClusterID
from spacy.lexeme cimport Lexeme
from spacy.lexeme cimport Distribution
from spacy.lexeme cimport Orthography
cdef class Language:
@@ -29,7 +27,7 @@ cdef class Language:
cdef dense_hash_map[StringHash, size_t] vocab
cdef dict bacov
cdef Tokens tokenize(self, unicode text)
cpdef Tokens tokenize(self, unicode text)
cdef Lexeme* lookup(self, unicode string) except NULL
cdef Lexeme** lookup_chunk(self, unicode chunk) except NULL
@@ -37,7 +35,8 @@ cdef class Language:
cdef Lexeme** new_chunk(self, unicode string, list substrings) except NULL
cdef Lexeme* new_lexeme(self, unicode lex) except NULL
cdef unicode unhash(self, StringHash hashed)
cpdef unicode unhash(self, StringHash hashed)
cpdef list find_substrings(self, unicode word)
cpdef list find_substrings(self, unicode chunk)
cdef int find_split(self, unicode word)
cdef int set_orth(self, unicode string, Lexeme* word)

View File

@@ -1,4 +1,13 @@
# cython: profile=True
# cython: embedsignature=True
"""Common classes and utilities across languages.
Provides the main implementation for the spacy tokenizer. Specific languages
subclass the Language class, over-writing the tokenization rules as necessary.
Special-case tokenization rules are read from data/<lang>/tokenization .
"""
from __future__ import unicode_literals
from libc.stdlib cimport calloc, free
@@ -6,54 +15,13 @@ from libcpp.pair cimport pair
from cython.operator cimport dereference as deref
from spacy.lexeme cimport Lexeme
from spacy.lexeme cimport BLANK_WORD
from spacy.string_tools cimport substr
from spacy.lexeme cimport LexID
from . import util
from os import path
DIST_FLAGS = {}
TAGS = {}
def get_normalized(unicode lex):
if lex.isalpha() and lex.islower():
return lex
else:
return get_word_shape(lex)
def get_word_shape(unicode lex):
cdef size_t length = len(lex)
shape = ""
last = ""
shape_char = ""
seq = 0
for c in lex:
if c.isalpha():
if c.isupper():
shape_char = "X"
else:
shape_char = "x"
elif c.isdigit():
shape_char = "d"
else:
shape_char = c
if shape_char == last:
seq += 1
else:
seq = 0
last = shape_char
if seq < 3:
shape += shape_char
assert shape
return shape
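Traced by hand from the loop above (not an executed doctest): upper-case letters
map to "X", lower-case to "x", digits to "d", other characters are kept as-is,
and runs of the same shape character are capped at three.

    >>> get_word_shape(u"Hello")
    u'Xxxx'
    >>> get_word_shape(u"2014")
    u'ddd'
    >>> get_word_shape(u"C3PO")
    u'XdXX'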
def set_orth_flags(lex):
return 0
DIST_FLAGS = {}
cdef class Language:
def __cinit__(self, name):
@@ -64,9 +32,19 @@ cdef class Language:
self.chunks.set_empty_key(0)
self.vocab.set_empty_key(0)
self.load_tokenization(util.read_tokenization(name))
#self.load_dist_info(util.read_dist_info(name))
self.load_dist_info(util.read_dist_info(name))
cdef Tokens tokenize(self, unicode string):
cpdef Tokens tokenize(self, unicode string):
"""Tokenize.
Split the string into tokens.
Args:
string (unicode): The string to split.
Returns:
tokens (Tokens): A Tokens object.
"""
cdef Lexeme** chunk
cdef Tokens tokens = Tokens(self)
cdef size_t length = len(string)
@@ -85,8 +63,7 @@ cdef class Language:
return tokens
cdef Lexeme* lookup(self, unicode string) except NULL:
if len(string) == 0:
return &BLANK_WORD
assert len(string) != 0
cdef Lexeme* word = <Lexeme*>self.vocab[hash(string)]
if word == NULL:
word = self.new_lexeme(string)
@@ -113,56 +90,79 @@ cdef class Language:
cdef bytes byte_string = string.encode('utf8')
word.string = <char*>byte_string
word.length = len(byte_string)
word.orth.flags = set_orth_flags(string)
cdef unicode norm = get_normalized(string)
cdef unicode shape = get_word_shape(string)
cdef unicode last3 = string[-3:]
word.lex = hash(string)
word.orth.norm = hash(norm)
word.orth.shape = hash(shape)
word.orth.last3 = hash(last3)
self.bacov[word.lex] = string
self.bacov[word.orth.norm] = norm
self.bacov[word.orth.shape] = shape
self.bacov[word.orth.last3] = last3
self.set_orth(string, word)
self.vocab[hash(string)] = <size_t>word
word.lex = hash(string)
self.bacov[word.lex] = string
self.vocab[word.lex] = <LexID>word
return word
cdef unicode unhash(self, StringHash hash_value):
cpdef unicode unhash(self, StringHash hash_value):
'''Fetch a string from the reverse index, given its hash value.'''
return self.bacov[hash_value]
cpdef list find_substrings(self, unicode word):
cpdef list find_substrings(self, unicode chunk):
"""Find how to split a chunk into substrings.
This method calls find_split repeatedly. Most languages will want to
override find_split, but it may be useful to override this instead.
Args:
chunk (unicode): The string to be split, e.g. u"Mike's!"
Returns:
substrings (list): The component substrings, e.g. [u"Mike", "'s", "!"].
"""
substrings = []
while word:
split = self.find_split(word)
while chunk:
split = self.find_split(chunk)
if split == 0:
substrings.append(word)
substrings.append(chunk)
break
substrings.append(word[:split])
word = word[split:]
substrings.append(chunk[:split])
chunk = chunk[split:]
return substrings
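A pure-Python sketch of the loop above. The find_split rule here is a simplified
stand-in (each language overrides find_split with its own rules), and PUNCT is an
invented helper set:

    PUNCT = set(u"!?.,;:")

    def find_split(chunk):
        if chunk[0] in PUNCT:                 # leading punctuation: peel one char
            return 1
        i = 0
        while i < len(chunk) and chunk[i] not in PUNCT:
            i += 1
        return i                              # split before trailing punctuation

    def find_substrings(chunk):
        substrings = []
        while chunk:
            split = find_split(chunk)
            if split == 0:
                substrings.append(chunk)
                break
            substrings.append(chunk[:split])
            chunk = chunk[split:]
        return substrings

    find_substrings(u"world!?")               # [u'world', u'!', u'?']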
cdef int find_split(self, unicode word):
return len(word)
def load_tokenization(self, token_rules=None):
cdef int set_orth(self, unicode string, Lexeme* word):
pass
def load_tokenization(self, token_rules):
'''Load special-case tokenization rules.
Loads special-case tokenization rules into the Language.chunk cache,
read from data/<lang>/tokenization . The special cases are loaded before
any language data is tokenized, giving these priority. For instance,
the English tokenization rules map "ain't" to ["are", "not"].
Args:
token_rules (list): A list of (chunk, tokens) pairs, where chunk is
a string and tokens is a list of strings.
'''
for chunk, tokens in token_rules:
self.new_chunk(chunk, tokens)
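The shape of the token_rules argument, as described in the docstring; the
entries here are illustrative rather than copied from the data files:

    token_rules = [
        (u"ain't", [u"are", u"not"]),
        (u"aint",  [u"are", u"not"]),
        (u"can't", [u"ca", u"n't"]),
    ]
    # lang.load_tokenization(token_rules)  # each pair pre-caches one chunk's split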
def load_dist_info(self, dist_info):
'''Load distributional information for the known lexemes of the language.
The distributional information is read from data/<lang>/dist_info.json .
It contains information like the (smoothed) unigram log probability of
the word, how often the word is found upper-cased, how often the word
is found title-cased, etc.
'''
cdef unicode string
cdef dict word_dist
cdef Lexeme* w
for string, word_dist in dist_info.items():
w = self.lookup(string)
w.dist.prob = word_dist.prob
w.dist.cluster = word_dist.cluster
w.prob = word_dist.prob
w.cluster = word_dist.cluster
for flag in word_dist.flags:
w.dist.flags |= DIST_FLAGS[flag]
w.dist_flags |= DIST_FLAGS[flag]
for tag in word_dist.tagdict:
w.dist.tagdict |= TAGS[tag]
w.possible_tags |= TAGS[tag]
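A hypothetical dist_info entry, showing only the fields the loop above reads
(prob, cluster, flags, tagdict); apart from the log probability quoted in
prob_of's docstring, the values are invented:

    dist_info = {
        u"world": {
            "prob": -20.10340371976182,   # smoothed unigram log probability
            "cluster": 0b11011000,        # Brown cluster ID (a binary address)
            "flags": ["OFT_UPPER"],       # keys into the DIST_FLAGS table
            "tagdict": ["NN"],            # keys into the TAGS table
        },
    }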
cdef inline bint _is_whitespace(Py_UNICODE c) nogil:

View File

@@ -4,7 +4,6 @@ from spacy.lexeme cimport Lexeme
from cython.operator cimport dereference as deref
from spacy.spacy cimport Language
from spacy.lexeme cimport StringAttr
cdef class Tokens:
@@ -15,5 +14,5 @@ cdef class Tokens:
cpdef int append(self, Lexeme_addr token)
cpdef int extend(self, Tokens other) except -1
cpdef object group_by(self, StringAttr attr)
cpdef dict count_by(self, StringAttr attr)
cpdef object group_by(self, size_t attr)
cpdef dict count_by(self, size_t attr)

View File

@@ -3,7 +3,7 @@ from cython.operator cimport preincrement as inc
from spacy.lexeme cimport Lexeme
from spacy.lexeme cimport attr_of, lex_of, norm_of, shape_of
#from spacy.lexeme cimport attr_of, lex_of, norm_of, shape_of
from spacy.spacy cimport StringHash
@@ -37,7 +37,7 @@ cdef class Tokens:
for el in other:
self.append(el)
cpdef object group_by(self, StringAttr attr):
cpdef object group_by(self, size_t attr):
'''Group tokens that share the property attr into Tokens instances, and
return a list of them. Returns a tuple of three lists:
@@ -66,7 +66,8 @@ cdef class Tokens:
cdef StringHash key
cdef Lexeme_addr t
for t in self.vctr[0]:
key = attr_of(t, attr)
#key = attr_of(t, attr)
key = 0
if key in indices:
groups[indices[key]].append(t)
else:
@@ -77,12 +78,13 @@ cdef class Tokens:
groups[-1].append(t)
return names, hashes, groups
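Intended call shape for group_by, per the docstring and the return statement
above. Note that in this broken revision the grouping key is stubbed to 0, so
every token would land in a single group:

    from spacy import en

    tokens = en.tokenize(u"The the THE cat")
    attr = 0                                  # stands in for a StringAttr value such as SHAPE
    names, hashes, groups = tokens.group_by(attr)
    # groups is a list of Tokens instances, one per distinct key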
cpdef dict count_by(self, StringAttr attr):
cpdef dict count_by(self, size_t attr):
counts = {}
cdef Lexeme_addr t
cdef StringHash key
for t in self.vctr[0]:
key = attr_of(t, attr)
#key = attr_of(t, attr)
key = 0
if key not in counts:
counts[key] = 0
counts[key] += 1