* Redesign proceeding

Matthew Honnibal 2014-08-28 19:45:09 +02:00
parent fd4e61e58b
commit c282e6d5fb
6 changed files with 137 additions and 178 deletions

View File

@@ -45,8 +45,71 @@ cimport lang
 from spacy import orth
 
+TAG_THRESH = 0.5
+UPPER_THRESH = 0.2
+LOWER_THRESH = 0.5
+TITLE_THRESH = 0.7
+
+NR_FLAGS = 0
+
+OFT_UPPER = NR_FLAGS; NR_FLAGS += 1
+OFT_LOWER = NR_FLAGS; NR_FLAGS += 1
+OFT_TITLE = NR_FLAGS; NR_FLAGS += 1
+
+IS_ALPHA = NR_FLAGS; NR_FLAGS += 1
+IS_DIGIT = NR_FLAGS; NR_FLAGS += 1
+IS_PUNCT = NR_FLAGS; NR_FLAGS += 1
+IS_SPACE = NR_FLAGS; NR_FLAGS += 1
+IS_ASCII = NR_FLAGS; NR_FLAGS += 1
+IS_TITLE = NR_FLAGS; NR_FLAGS += 1
+IS_LOWER = NR_FLAGS; NR_FLAGS += 1
+IS_UPPER = NR_FLAGS; NR_FLAGS += 1
+
+CAN_PUNCT = NR_FLAGS; NR_FLAGS += 1
+CAN_CONJ = NR_FLAGS; NR_FLAGS += 1
+CAN_NUM = NR_FLAGS; NR_FLAGS += 1
+CAN_DET = NR_FLAGS; NR_FLAGS += 1
+CAN_ADP = NR_FLAGS; NR_FLAGS += 1
+CAN_ADJ = NR_FLAGS; NR_FLAGS += 1
+CAN_ADV = NR_FLAGS; NR_FLAGS += 1
+CAN_VERB = NR_FLAGS; NR_FLAGS += 1
+CAN_NOUN = NR_FLAGS; NR_FLAGS += 1
+CAN_PDT = NR_FLAGS; NR_FLAGS += 1
+CAN_POS = NR_FLAGS; NR_FLAGS += 1
+CAN_PRON = NR_FLAGS; NR_FLAGS += 1
+CAN_PRT = NR_FLAGS; NR_FLAGS += 1
+
+
 cdef class English(Language):
+    def __cinit__(self, name):
+        flag_funcs = [0 for _ in range(NR_FLAGS)]
+
+        flag_funcs[OFT_UPPER] = orth.oft_case('upper', UPPER_THRESH)
+        flag_funcs[OFT_LOWER] = orth.oft_case('lower', LOWER_THRESH)
+        flag_funcs[OFT_TITLE] = orth.oft_case('title', TITLE_THRESH)
+
+        flag_funcs[IS_ALPHA] = orth.is_alpha
+        flag_funcs[IS_DIGIT] = orth.is_digit
+        flag_funcs[IS_PUNCT] = orth.is_punct
+        flag_funcs[IS_SPACE] = orth.is_space
+        flag_funcs[IS_TITLE] = orth.is_title
+        flag_funcs[IS_LOWER] = orth.is_lower
+        flag_funcs[IS_UPPER] = orth.is_upper
+
+        flag_funcs[CAN_PUNCT] = orth.can_tag('PUNCT', TAG_THRESH)
+        flag_funcs[CAN_CONJ] = orth.can_tag('CONJ', TAG_THRESH)
+        flag_funcs[CAN_NUM] = orth.can_tag('NUM', TAG_THRESH)
+        flag_funcs[CAN_DET] = orth.can_tag('DET', TAG_THRESH)
+        flag_funcs[CAN_ADP] = orth.can_tag('ADP', TAG_THRESH)
+        flag_funcs[CAN_ADJ] = orth.can_tag('ADJ', TAG_THRESH)
+        flag_funcs[CAN_VERB] = orth.can_tag('VERB', TAG_THRESH)
+        flag_funcs[CAN_NOUN] = orth.can_tag('NOUN', TAG_THRESH)
+        flag_funcs[CAN_PDT] = orth.can_tag('PDT', TAG_THRESH)
+        flag_funcs[CAN_POS] = orth.can_tag('POS', TAG_THRESH)
+        flag_funcs[CAN_PRT] = orth.can_tag('PRT', TAG_THRESH)
+
+        Language.__init__(self, name, flag_funcs)
+
     cpdef int _split_one(self, unicode word):
         cdef size_t length = len(word)
         cdef int i = 0

@@ -81,48 +144,3 @@ cdef bint _check_punct(unicode word, size_t i, size_t length):
 EN = English('en')
-
-# Thresholds for frequency related flags
-cdef double TAG_THRESH = 0.5
-cdef double LOWER_THRESH = 0.5
-cdef double UPPER_THRESH = 0.3
-cdef double TITLE_THRESH = 0.9
-
-# Python-readable flag constants --- can't read an enum from Python
-ALPHA = EN.lexicon.add_flag(orth.is_alpha)
-DIGIT = EN.lexicon.add_flag(orth.is_digit)
-PUNCT = EN.lexicon.add_flag(orth.is_punct)
-SPACE = EN.lexicon.add_flag(orth.is_space)
-PUNCT = EN.lexicon.add_flag(orth.is_punct)
-ASCII = EN.lexicon.add_flag(orth.is_ascii)
-TITLE = EN.lexicon.add_flag(orth.is_title)
-LOWER = EN.lexicon.add_flag(orth.is_lower)
-UPPER = EN.lexicon.add_flag(orth.is_upper)
-
-OFT_LOWER = EN.lexicon.add_flag(orth.case_trend('lower', LOWER_THRESH))
-OFT_UPPER = EN.lexicon.add_flag(orth.case_trend('upper', UPPER_THRESH))
-OFT_TITLE = EN.lexicon.add_flag(orth.case_trend('title', TITLE_THRESH))
-
-CAN_PUNCT = EN.lexicon.add_flag(orth.can_tag("PUNCT", TAG_THRESH))
-CAN_CONJ = EN.lexicon.add_flag(orth.can_tag("CONJ", TAG_THRESH))
-CAN_NUM = EN.lexicon.add_flag(orth.can_tag("NUM", TAG_THRESH))
-CAN_N = EN.lexicon.add_flag(orth.can_tag("N", TAG_THRESH))
-CAN_DET = EN.lexicon.add_flag(orth.can_tag("DET", TAG_THRESH))
-CAN_ADP = EN.lexicon.add_flag(orth.can_tag("ADP", TAG_THRESH))
-CAN_ADJ = EN.lexicon.add_flag(orth.can_tag("ADJ", TAG_THRESH))
-CAN_ADV = EN.lexicon.add_flag(orth.can_tag("ADV", TAG_THRESH))
-CAN_VERB = EN.lexicon.add_flag(orth.can_tag("VERB", TAG_THRESH))
-CAN_NOUN = EN.lexicon.add_flag(orth.can_tag("NOUN", TAG_THRESH))
-CAN_PDT = EN.lexicon.add_flag(orth.can_tag("PDT", TAG_THRESH))
-CAN_POS = EN.lexicon.add_flag(orth.can_tag("POS", TAG_THRESH))
-CAN_PRON = EN.lexicon.add_flag(orth.can_tag("PRON", TAG_THRESH))
-CAN_PRT = EN.lexicon.add_flag(orth.can_tag("PRT", TAG_THRESH))
-
-# These are the name of string transforms
-SIC = EN.lexicon.add_transform(orth.sic_string)
-CANON_CASED = EN.lexicon.add_transform(orth.canon_case)
-SHAPE = EN.lexicon.add_transform(orth.word_shape)
-NON_SPARSE = EN.lexicon.add_transform(orth.non_sparse)
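
The flag constants that used to be registered on EN.lexicon at import time are now plain module-level integers, computed before the lexicon is built. A minimal usage sketch, not part of the commit, assuming spacy.en keeps exporting EN and the CAN_VERB constant defined above, and that Lexeme.check_flag behaves as declared later in this diff:

    # Hedged example: EN, CAN_VERB and check_flag are taken from this diff;
    # the word "run" is only an illustration.
    from spacy.en import EN, CAN_VERB

    word = EN.lexicon.lookup(u'run')    # fetch (or create) the Lexeme for "run"
    if word.check_flag(CAN_VERB):       # flag precomputed from tag statistics
        print(u"'run' is commonly tagged VERB")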

View File

@@ -4,14 +4,10 @@ from spacy.word cimport Lexeme
 cdef class Lexicon:
-    cdef public dict probs
-    cdef public dict clusters
-    cdef public dict case_stats
-    cdef public dict tag_stats
-    cdef public list flag_checkers
-    cdef public list string_transformers
-    cdef dict lexicon
+    cdef list string_features
+    cdef list flag_features
+    cdef dict _dict
 
     cpdef Lexeme lookup(self, unicode string)

@@ -26,4 +22,3 @@ cdef class Language:
     cdef list _tokenize(self, unicode string)
     cpdef list _split(self, unicode string)
     cpdef int _split_one(self, unicode word)

View File

@@ -10,17 +10,38 @@ from __future__ import unicode_literals
 from libc.stdlib cimport calloc, free
-from . import util
 import json
 from os import path
 
+from .util import read_lang_data
+
 
 cdef class Language:
-    def __cinit__(self, name):
+    """Base class for language-specific tokenizers.
+
+    Most subclasses will override the _split or _split_one methods, which take
+    a string of non-whitespace characters and output a list of strings. This
+    function is called by _tokenize, which sits behind a cache and turns the
+    list of strings into Lexeme objects via the Lexicon. Most languages will not
+    need to override _tokenize or tokenize.
+
+    The language is supplied a list of boolean functions, used to compute flag
+    features. These are passed to the language's Lexicon object.
+
+    The language's name is used to look up default data-files, found in data/<name.
+    """
+    def __cinit__(self, name, string_features=None, flag_features=None):
+        if flag_features is None:
+            flag_features = []
+        if string_features is None:
+            string_features = []
         self.name = name
         self.cache = {}
-        self.lexicon = Lexicon()
-        self.load_special_tokenization(util.read_tokenization(name))
+        lang_data = read_lang_data(name)
+        rules, words, probs, clusters, case_stats, tag_stats = lang_data
+        self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats,
+                               string_features, flag_features)
+        self.load_special_tokenization(rules)
 
     cpdef list tokenize(self, unicode string):
         """Tokenize a string.

@@ -37,6 +58,8 @@ cdef class Language:
         Returns:
             tokens (Tokens): A Tokens object, giving access to a sequence of LexIDs.
         """
+        if not string:
+            return []
         cdef list tokens = []
         cdef size_t length = len(string)
         cdef size_t start = 0

@@ -107,85 +130,32 @@ cdef class Language:
 cdef class Lexicon:
-    def __cinit__(self):
-        self.flag_checkers = []
-        self.string_transformers = []
-        self.probs = {}
-        self.clusters = {}
-        self.case_stats = {}
-        self.tag_stats = {}
-        self.lexicon = {}
+    def __cinit__(self, words, probs, clusters, case_stats, tag_stats,
+                  string_features, flag_features):
+        self.flag_features = flag_features
+        self.string_features = string_features
+        self._dict = {}
+        cdef Lexeme word
+        for string in words:
+            word = Lexeme(string, probs.get(string, 0.0), clusters.get(string, 0),
+                          case_stats.get(string, {}), tag_stats.get(string, {}),
+                          self.string_features, self.flag_features)
+            self._dict[string] = word
 
     cpdef Lexeme lookup(self, unicode string):
         """Retrieve (or create, if not found) a Lexeme for a string, and return it.
 
         Args:
             string (unicode): The string to be looked up. Must be unicode, not bytes.
 
         Returns:
             lexeme (Lexeme): A reference to a lexical type.
         """
         assert len(string) != 0
-        if string in self.lexicon:
-            return self.lexicon[string]
-        prob = _pop_default(self.probs, string, 0.0)
-        cluster = _pop_default(self.clusters, string, 0.0)
-        case_stats = _pop_default(self.case_stats, string, {})
-        tag_stats = _pop_default(self.tag_stats, string, {})
-        cdef Lexeme word = Lexeme(string, prob, cluster, case_stats, tag_stats,
-                                  self.flag_checkers, self.string_transformers)
-        self.lexicon[string] = word
+        if string in self._dict:
+            return self._dict[string]
+        cdef Lexeme word = Lexeme(string, 0, 0, {}, {}, self.string_features,
+                                  self.flag_features)
+        self._dict[string] = word
         return word
 
-    def add_flag(self, flag_checker):
-        cdef unicode string
-        cdef Lexeme word
-        flag_id = len(self.flag_checkers)
-        for string, word in self.lexicon.items():
-            if flag_checker(string, word.prob, {}, {}):
-                word.set_flag(flag_id)
-        self.flag_checkers.append(flag_checker)
-        return flag_id
-
-    def add_transform(self, string_transform):
-        self.string_transformers.append(string_transform)
-        for string, word in self.lexicon.items():
-            word.add_view(string_transform(string, word.prob, {}, {}))
-        return len(self.string_transformers) - 1
-
-    def load_probs(self, location):
-        """Load unigram probabilities.
-        """
-        # Dict mapping words to floats
-        self.probs = json.load(location)
-        cdef Lexeme word
-        cdef unicode string
-        for string, word in self.lexicon.items():
-            prob = _pop_default(self.probs, string, 0.0)
-            word.prob = prob
-
-    def load_clusters(self, location):
-        # TODO: Find out endianness
-        # Dict mapping words to ??-endian ints
-        self.clusters = json.load(location)
-        cdef Lexeme word
-        cdef unicode string
-        for string, word in self.lexicon.items():
-            cluster = _pop_default(self.clusters, string, 0)
-            word.cluster = cluster
-
-    def load_stats(self, location):
-        """Load distributional stats.
-        """
-        # Dict mapping string to dict of arbitrary stuff.
-        raise NotImplementedError
-
-
-def _pop_default(dict d, key, default):
-    return d.pop(key) if key in d else default
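
The docstring above spells out the intended extension protocol: a language passes its feature functions to its Lexicon, where flag functions are called as f(string, prob, case_stats, tag_stats) and string functions as f(string, prob, cluster, case_stats, tag_stats) (see the Lexeme constructor further down). A rough sketch of constructing a Language with custom features under those assumptions; the two feature functions here are hypothetical, only their signatures come from this diff, and the 'en' data files are assumed to exist under data/en:

    # Hedged sketch, not part of the commit.
    def looks_like_year(string, prob, case_stats, tag_stats):
        # boolean flag feature
        return string.isdigit() and len(string) == 4

    def lowercased(string, prob, cluster, case_stats, tag_stats):
        # string feature: each result is stored as a "view" on the Lexeme
        return string.lower()

    lang = Language('en', string_features=[lowercased], flag_features=[looks_like_year])
    tokens = lang.tokenize(u'founded in 2014')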

View File

@@ -7,19 +7,14 @@ DEF MAX_FLAG = 64
 cdef class Lexeme:
     # NB: the readonly keyword refers to _Python_ access. The attributes are
     # writeable from Cython.
-    cpdef readonly id_t id
     cpdef readonly size_t length
     cpdef readonly double prob
     cpdef readonly size_t cluster
-    cdef list views
-    cdef size_t nr_views
+    cpdef readonly string
+    cpdef readonly list views
 
     cdef readonly flag_t flags
 
     cpdef bint check_flag(self, size_t flag_id) except *
     cpdef int set_flag(self, size_t flag_id) except -1
-
-    cpdef unicode get_view_string(self, size_t i)
-    cpdef id_t get_view_id(self, size_t i) except 0
-    cpdef int add_view(self, unicode view) except -1

View File

@@ -49,42 +49,24 @@ cdef class Lexeme:
     while "dapple" is totally different. On the other hand, "scalable" receives
     the same cluster ID as "pineapple", which is not what we'd like.
     """
-    def __cinit__(self, unicode string, prob, cluster, case_stats,
-                  tag_stats, flag_checkers, string_transformers):
+    def __cinit__(self, unicode string, double prob, int cluster, dict case_stats,
+                  dict tag_stats, list string_features, list flag_features):
         self.prob = prob
         self.cluster = cluster
         self.length = len(string)
-        self.id = hash(string)
-        self.nr_views = len(string_transformers)
-        self.views = []
-        cdef unicode view
-        for i, string_transformer in enumerate(string_transformers):
-            view = string_transformer(string, prob, case_stats, tag_stats)
+        self.string = string
+        for string_feature in string_features:
+            view = string_feature(string, prob, cluster, case_stats, tag_stats)
             self.views.append(view)
-        for i, flag_checker in enumerate(flag_checkers):
-            if flag_checker(string, prob, case_stats, tag_stats):
+        for i, flag_feature in enumerate(flag_features):
+            if flag_feature(string, prob, case_stats, tag_stats):
                 self.set_flag(i)
 
     def __dealloc__(self):
         pass
 
-    property string:
-        def __get__(self):
-            return self.views[0]
-
-    cpdef unicode get_view_string(self, size_t i):
-        assert i < self.nr_views
-        return self.views[i]
-
-    cpdef id_t get_view_id(self, size_t i) except 0:
-        return <id_t>hash(self.views[i])
-
-    cpdef int add_view(self, unicode view) except -1:
-        self.nr_views += 1
-        self.views.append(view)
-
     cpdef bint check_flag(self, size_t flag_id) except *:
         """Access the value of one of the pre-computed boolean distribution features.

View File

@@ -1,42 +1,41 @@
 from __future__ import unicode_literals
 
-from spacy.en import tokenize
-from spacy.en import lookup
+from spacy.en import EN
 
 
 def test_single_word():
-    lex_ids = tokenize(u'hello')
-    assert lex_ids[0] == lookup(u'hello')
+    lex_ids = EN.tokenize(u'hello')
+    assert lex_ids[0] == EN.lexicon.lookup(u'hello')
 
 
 def test_two_words():
-    words = tokenize('hello possums')
+    words = EN.tokenize('hello possums')
     assert len(words) == 2
-    assert words[0] == lookup('hello')
+    assert words[0] == EN.lexicon.lookup('hello')
     assert words[0] != words[1]
 
 
 def test_punct():
-    tokens = tokenize('hello, possums.')
+    tokens = EN.tokenize('hello, possums.')
     assert len(tokens) == 4
-    assert tokens[0].lex == lookup('hello').lex
-    assert tokens[1].lex == lookup(',').lex
-    assert tokens[2].lex == lookup('possums').lex
-    assert tokens[1].lex != lookup('hello').lex
+    assert tokens[0].string == EN.lexicon.lookup('hello').string
+    assert tokens[1].string == EN.lexicon.lookup(',').string
+    assert tokens[2].string == EN.lexicon.lookup('possums').string
+    assert tokens[1].string != EN.lexicon.lookup('hello').string
 
 
 def test_digits():
-    lex_ids = tokenize('The year: 1984.')
+    lex_ids = EN.tokenize('The year: 1984.')
     assert len(lex_ids) == 5
-    assert lex_ids[0].lex == lookup('The').lex
-    assert lex_ids[3].lex == lookup('1984').lex
-    assert lex_ids[4].lex == lookup('.').lex
+    assert lex_ids[0].string == EN.lexicon.lookup('The').string
+    assert lex_ids[3].string == EN.lexicon.lookup('1984').string
+    assert lex_ids[4].string == EN.lexicon.lookup('.').string
 
 
 def test_contraction():
-    lex_ids = tokenize("don't giggle")
+    lex_ids = EN.tokenize("don't giggle")
     assert len(lex_ids) == 3
-    assert lex_ids[1].lex == lookup("not").lex
-    lex_ids = tokenize("i said don't!")
+    assert lex_ids[1].string == EN.lexicon.lookup("not").string
+    lex_ids = EN.tokenize("i said don't!")
     assert len(lex_ids) == 4
-    assert lex_ids[3].lex == lookup('!').lex
+    assert lex_ids[3].string == EN.lexicon.lookup('!').string