* Fix bug with trailing punct on contractions. Reduced efficiency, and slightly hacky implementation.

Matthew Honnibal 2014-09-12 18:00:42 +02:00
parent 7eab281194
commit 985bc68327
5 changed files with 50 additions and 47 deletions
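The change below drops the `LexList` linked-list entries stored in a `dense_hash_map` and caches token sequences in an ordinary Python `dict` instead, keyed by the hash of a substring and holding a list of lexeme addresses. Crucially, `_tokenize` now looks each split piece up in that cache individually rather than matching the entire remaining string, which is what lets trailing punctuation after a cached contraction be split off correctly. A rough pure-Python sketch of the new flow (the names mirror the Cython code in the diff, but the pointer casts and the hashing of the raw Py_UNICODE buffer are simplified away here):

    # Sketch only: `cache` maps a chunk or piece to a list of lexemes,
    # `lexicon.get` returns (and interns) a lexeme, and `split_one` returns
    # how many characters of the remaining text form the next token.
    def tokenize_chunk(chunk, cache, lexicon, split_one):
        if chunk in cache:                      # whole-chunk hit, e.g. a special-case rule
            return list(cache[chunk])
        lexemes = []
        start = 0
        while start < len(chunk):
            split = split_one(chunk[start:])
            piece = chunk[start:start + split]
            if piece in cache:                  # e.g. a pre-loaded contraction
                lexemes.extend(cache[piece])
            else:
                lexemes.append(lexicon.get(piece))
            start += split
        # The per-chunk result is deliberately left uncached for now
        # ("#self.cache[hashed] = lexemes" in the diff), which is the
        # "reduced efficiency" the commit message mentions.
        return lexemes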

View File

@@ -40,3 +40,5 @@ cdef class EnglishTokens(Tokens):
 
 cdef class English(Language):
     cdef int _split_one(self, Py_UNICODE* characters, size_t length)

View File

@@ -236,7 +236,7 @@ cdef class English(Language):
     fl_is_digit = Flag_IsDigit
     v_shape = View_WordShape
     def __cinit__(self, name, user_string_features, user_flag_features):
-        self.cache.set_empty_key(0)
+        self.cache = {}
         lang_data = util.read_lang_data(name)
         rules, words, probs, clusters, case_stats, tag_stats = lang_data
         self.lexicon = lang.Lexicon(words, probs, clusters, case_stats, tag_stats,

View File

@@ -9,6 +9,11 @@ from libcpp.vector cimport vector
 from libc.stdint cimport uint64_t, int64_t
 
+cdef extern from "Python.h":
+    cdef bint Py_UNICODE_ISSPACE(Py_UNICODE ch)
+    cdef bint Py_UNICODE_ISALNUM(Py_UNICODE ch)
+
 cdef extern from "sparsehash/dense_hash_map" namespace "google":
     cdef cppclass dense_hash_map[K, D]:
         K& key_type
@@ -52,10 +57,6 @@ cdef extern from "sparsehash/dense_hash_map" namespace "google":
         D& operator[](K&) nogil
 
-cdef struct LexList:
-    LexemeC* lex
-    LexList* tail
-
 cdef class Lexicon:
     cpdef readonly size_t size
@@ -70,13 +71,12 @@ cdef class Lexicon:
 
 cdef class Language:
     cdef unicode name
-    cdef dense_hash_map[uint64_t, size_t] cache
-    cdef size_t cache_size
+    cdef dict cache
     cpdef readonly Lexicon lexicon
     cpdef readonly object tokens_class
 
     cpdef Tokens tokenize(self, unicode text)
     cpdef Lexeme lookup(self, unicode text)
 
-    cdef int _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length) except -1
+    cdef _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length)
     cdef int _split_one(self, Py_UNICODE* characters, size_t length)
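The `Py_UNICODE_ISSPACE` declaration added above is used in `tokenize()` (next file) to replace the hard-coded `c == ' ' or c == '\n' or c == '\t'` check. In plain Python terms the effect is roughly that of `str.isspace()`, which also accepts other Unicode whitespace such as a non-breaking space; a minimal illustration of the difference:

    # Approximation only: str.isspace() stands in for Py_UNICODE_ISSPACE here.
    def old_is_space(c):
        return c == ' ' or c == '\n' or c == '\t'

    def new_is_space(c):
        return c.isspace()

    assert not old_is_space(u'\xa0')   # non-breaking space was missed before
    assert new_is_space(u'\xa0')
    assert new_is_space(u'\t') and new_is_space(u'\n')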

View File

@@ -40,8 +40,7 @@ cdef class Language:
         if string_features is None:
             string_features = []
         self.name = name
-        self.cache.set_empty_key(0)
-        self.cache_size = 0
+        self.cache = {}
         lang_data = read_lang_data(name)
         rules, words, probs, clusters, case_stats, tag_stats = lang_data
         self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats,
@@ -80,7 +79,6 @@ cdef class Language:
         Returns:
             tokens (Tokens): A Tokens object, giving access to a sequence of LexIDs.
         """
-        print repr(string)
         cdef size_t length = len(string)
         cdef Tokens tokens = self.tokens_class(length)
         if length == 0:
@@ -92,7 +90,7 @@ cdef class Language:
         cdef Py_UNICODE c
         for i in range(length):
             c = characters[i]
-            if c == ' ' or c == '\n' or c == '\t':
+            if Py_UNICODE_ISSPACE(c) == 1:
                 if start < i:
                     self._tokenize(tokens, &characters[start], i - start)
                 start = i + 1
@@ -101,38 +99,30 @@ cdef class Language:
             self._tokenize(tokens, &characters[start], i - start)
         return tokens
 
-    cdef int _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length) except -1:
+    cdef _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length):
+        cdef list lexemes
+        cdef size_t lex_addr
         cdef uint64_t hashed = hash64(characters, length * sizeof(Py_UNICODE), 0)
-        cdef LexList* node = <LexList*>self.cache[hashed]
-        if node is not NULL:
-            while node != NULL:
-                tokens.push_back(node.lex)
-                node = node.tail
+        if hashed in self.cache:
+            for lex_addr in self.cache[hashed]:
+                tokens.push_back(<LexemeC*>lex_addr)
             return 0
-        node = <LexList*>calloc(1, sizeof(LexList))
-        self.cache[hashed] = <size_t>node
+        lexemes = []
         cdef size_t start = 0
         cdef size_t split = 0
         while start < length:
             split = self._split_one(&characters[start], length - start)
-            node.lex = <LexemeC*>self.lexicon.get(&characters[start], split)
-            tokens.push_back(node.lex)
-            start += split
-            if start >= length:
-                break
-            hashed = hash64(&characters[start], (length - start) * sizeof(Py_UNICODE), 0)
-            node.tail = <LexList*>self.cache[hashed]
-            if node.tail == NULL:
-                node.tail = <LexList*>calloc(1, sizeof(LexList))
-                self.cache[hashed] = <size_t>node.tail
-                node = node.tail
-            else:
-                node = node.tail
-                while node != NULL:
-                    tokens.push_back(node.lex)
-                    node = node.tail
-                break
+            hashed = hash64(&characters[start], split * sizeof(Py_UNICODE), 0)
+            if hashed in self.cache:
+                lexemes.extend(self.cache[hashed])
+            else:
+                lexeme = <LexemeC*>self.lexicon.get(&characters[start], split)
+                lexemes.append(<size_t>lexeme)
+            start += split
+        for lex_addr in lexemes:
+            tokens.push_back(<LexemeC*>lex_addr)
+        #self.cache[hashed] = lexemes
 
     cdef int _split_one(self, Py_UNICODE* characters, size_t length):
         return length
@@ -149,17 +139,14 @@ cdef class Language:
         token_rules (list): A list of (chunk, tokens) pairs, where chunk is
             a string and tokens is a list of strings.
         '''
-        cdef LexList* node
+        cdef list lexemes
        cdef uint64_t hashed
         for string, substrings in token_rules:
             hashed = hash64(<Py_UNICODE*>string, len(string) * sizeof(Py_UNICODE), 0)
-            node = <LexList*>calloc(1, sizeof(LexList))
-            self.cache[hashed] = <size_t>node
-            for substring in substrings[:-1]:
-                node.lex = <LexemeC*>self.lexicon.get(<Py_UNICODE*>substring, len(substring))
-                node.tail = <LexList*>calloc(1, sizeof(LexList))
-                node = node.tail
-            node.lex = <LexemeC*>self.lexicon.get(<Py_UNICODE*>substrings[-1], len(substrings[-1]))
+            lexemes = []
+            for substring in substrings:
+                lexemes.append(self.lexicon.get(<Py_UNICODE*>substring, len(substring)))
+            self.cache[hashed] = lexemes
 
 
 cdef class Lexicon:
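For comparison, the rewritten rule loader at the end of this hunk amounts to the following plain-Python sketch: one cache entry per special-case string, holding the lexemes for all of its substrings, with no hand-built linked list and no separate handling of the final substring. (The example rule in the comment is illustrative only; the real rules come from `read_lang_data()`.)

    def load_special_tokenization(cache, lexicon, token_rules):
        # token_rules: list of (string, substrings) pairs,
        # e.g. ("don't", ["do", "n't"]) -- exact substrings are defined
        # by the language data, not shown in this diff.
        for string, substrings in token_rules:
            cache[string] = [lexicon.get(substring) for substring in substrings]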

View File

@@ -26,6 +26,8 @@ def test_punct():
 def test_digits():
     lex_ids = EN.tokenize('The year: 1984.')
+    assert lex_ids.string(4) == "."
+    assert lex_ids.string(3) == "1984"
     assert len(lex_ids) == 5
     assert lex_ids[0].string == EN.lexicon.lookup('The').string
     assert lex_ids[3].string == EN.lexicon.lookup('1984').string
@@ -37,5 +39,17 @@ def test_contraction():
     assert len(lex_ids) == 3
     assert lex_ids[1].string == EN.lexicon.lookup("not").string
     lex_ids = EN.tokenize("i said don't!")
-    assert len(lex_ids) == 4
-    assert lex_ids[3].string == EN.lexicon.lookup('!').string
+    assert len(lex_ids) == 5
+    assert lex_ids[4].string == EN.lexicon.lookup('!').string
+
+
+def test_contraction_punct():
+    tokens = EN.tokenize("(can't")
+    assert len(tokens) == 3
+    tokens = EN.tokenize("`ain't")
+    assert len(tokens) == 3
+    tokens = EN.tokenize('''"isn't''')
+    assert len(tokens) == 3
+    tokens = EN.tokenize("can't!")
+    assert len(tokens) == 3