* Fix bug with trailing punct on contractions. Reduced efficiency, and slightly hacky implementation.
commit 985bc68327
parent 7eab281194
@@ -40,3 +40,5 @@ cdef class EnglishTokens(Tokens):
 
 cdef class English(Language):
     cdef int _split_one(self, Py_UNICODE* characters, size_t length)
+
+
@@ -236,7 +236,7 @@ cdef class English(Language):
     fl_is_digit = Flag_IsDigit
     v_shape = View_WordShape
     def __cinit__(self, name, user_string_features, user_flag_features):
-        self.cache.set_empty_key(0)
+        self.cache = {}
         lang_data = util.read_lang_data(name)
         rules, words, probs, clusters, case_stats, tag_stats = lang_data
         self.lexicon = lang.Lexicon(words, probs, clusters, case_stats, tag_stats,
@@ -9,6 +9,11 @@ from libcpp.vector cimport vector
 from libc.stdint cimport uint64_t, int64_t
 
 
+cdef extern from "Python.h":
+    cdef bint Py_UNICODE_ISSPACE(Py_UNICODE ch)
+    cdef bint Py_UNICODE_ISALNUM(Py_UNICODE ch)
+
+
 cdef extern from "sparsehash/dense_hash_map" namespace "google":
     cdef cppclass dense_hash_map[K, D]:
         K& key_type
@@ -52,10 +57,6 @@ cdef extern from "sparsehash/dense_hash_map" namespace "google":
         D& operator[](K&) nogil
 
 
-cdef struct LexList:
-    LexemeC* lex
-    LexList* tail
-
 cdef class Lexicon:
     cpdef readonly size_t size
 
@@ -70,13 +71,12 @@ cdef class Lexicon:
 
 cdef class Language:
     cdef unicode name
-    cdef dense_hash_map[uint64_t, size_t] cache
-    cdef size_t cache_size
+    cdef dict cache
    cpdef readonly Lexicon lexicon
    cpdef readonly object tokens_class
 
    cpdef Tokens tokenize(self, unicode text)
    cpdef Lexeme lookup(self, unicode text)
 
-    cdef int _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length) except -1
+    cdef _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length)
     cdef int _split_one(self, Py_UNICODE* characters, size_t length)
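Note on the declaration change above: the per-language cache moves from a google::dense_hash_map pointing at calloc'd LexList chains to a plain Python dict whose values are lists of lexeme addresses — the "reduced efficiency, slightly hacky" trade-off named in the commit message. A minimal Python-level sketch of the new shape (names here are illustrative assumptions, not code from the commit):

# Sketch of the new Language.cache layout (Python analogue; in the Cython code
# the list elements are <size_t> casts of LexemeC* pointers).
cache = {}  # uint64 hash of a substring -> [lexeme address, ...]

def push_cached(cache, hashed, tokens):
    # Mirrors the fast path added to _tokenize: emit cached lexemes if present.
    if hashed in cache:
        tokens.extend(cache[hashed])
        return True
    return False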
@@ -40,8 +40,7 @@ cdef class Language:
         if string_features is None:
             string_features = []
         self.name = name
-        self.cache.set_empty_key(0)
-        self.cache_size = 0
+        self.cache = {}
         lang_data = read_lang_data(name)
         rules, words, probs, clusters, case_stats, tag_stats = lang_data
         self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats,
@@ -80,7 +79,6 @@ cdef class Language:
         Returns:
             tokens (Tokens): A Tokens object, giving access to a sequence of LexIDs.
         """
-        print repr(string)
         cdef size_t length = len(string)
         cdef Tokens tokens = self.tokens_class(length)
         if length == 0:
@@ -92,7 +90,7 @@ cdef class Language:
         cdef Py_UNICODE c
         for i in range(length):
             c = characters[i]
-            if c == ' ' or c == '\n' or c == '\t':
+            if Py_UNICODE_ISSPACE(c) == 1:
                 if start < i:
                     self._tokenize(tokens, &characters[start], i - start)
                 start = i + 1
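The hunk above swaps the literal space/newline/tab test for CPython's Py_UNICODE_ISSPACE, so any Unicode whitespace now ends a chunk. A rough Python-level analogue, assuming unicode.isspace() as the closest equivalent:

# The old test only split on ' ', '\n' and '\t'; Py_UNICODE_ISSPACE also treats
# characters such as '\r', '\x0b', '\x0c' and u'\xa0' as chunk boundaries.
for ch in u' \t\n\r\x0b\x0c\xa0':
    assert ch.isspace()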
@@ -101,38 +99,30 @@ cdef class Language:
             self._tokenize(tokens, &characters[start], i - start)
         return tokens
 
-    cdef int _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length) except -1:
+    cdef _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length):
+        cdef list lexemes
+        cdef size_t lex_addr
         cdef uint64_t hashed = hash64(characters, length * sizeof(Py_UNICODE), 0)
-        cdef LexList* node = <LexList*>self.cache[hashed]
-        if node is not NULL:
-            while node != NULL:
-                tokens.push_back(node.lex)
-                node = node.tail
+        if hashed in self.cache:
+            for lex_addr in self.cache[hashed]:
+                tokens.push_back(<LexemeC*>lex_addr)
             return 0
 
-        node = <LexList*>calloc(1, sizeof(LexList))
-        self.cache[hashed] = <size_t>node
+        lexemes = []
         cdef size_t start = 0
         cdef size_t split = 0
         while start < length:
             split = self._split_one(&characters[start], length - start)
-            node.lex = <LexemeC*>self.lexicon.get(&characters[start], split)
-            tokens.push_back(node.lex)
-            start += split
-            if start >= length:
-                break
-            hashed = hash64(&characters[start], (length - start) * sizeof(Py_UNICODE), 0)
-            node.tail = <LexList*>self.cache[hashed]
-            if node.tail == NULL:
-                node.tail = <LexList*>calloc(1, sizeof(LexList))
-                self.cache[hashed] = <size_t>node.tail
-                node = node.tail
+            hashed = hash64(&characters[start], split * sizeof(Py_UNICODE), 0)
+            if hashed in self.cache:
+                lexemes.extend(self.cache[hashed])
             else:
-                node = node.tail
-                while node != NULL:
-                    tokens.push_back(node.lex)
-                    node = node.tail
-                break
+                lexeme = <LexemeC*>self.lexicon.get(&characters[start], split)
+                lexemes.append(<size_t>lexeme)
+            start += split
+        for lex_addr in lexemes:
+            tokens.push_back(<LexemeC*>lex_addr)
+        #self.cache[hashed] = lexemes
 
     cdef int _split_one(self, Py_UNICODE* characters, size_t length):
         return length
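To make the control flow of the rewritten _tokenize easier to follow, here is a pure-Python sketch of the same logic. split_one and lexicon_get stand in for self._split_one and self.lexicon.get, and Python's hash() replaces hash64 over the raw buffer — these names are assumptions for illustration only:

def tokenize_chunk(chunk, cache, split_one, lexicon_get, tokens):
    # Fast path: the whole whitespace-delimited chunk has been seen before.
    hashed = hash(chunk)
    if hashed in cache:
        tokens.extend(cache[hashed])
        return
    # Otherwise split the chunk one token at a time, consulting the cache for
    # each piece, so a pre-loaded contraction rule still fires even when
    # trailing punctuation follows it (the bug this commit targets).
    lexemes = []
    start = 0
    while start < len(chunk):
        split = split_one(chunk[start:])          # length of the next token
        piece = chunk[start:start + split]
        piece_hash = hash(piece)
        if piece_hash in cache:
            lexemes.extend(cache[piece_hash])
        else:
            lexemes.append(lexicon_get(piece))
        start += split
    tokens.extend(lexemes)
    # The whole-chunk cache write stays disabled, matching the commented-out
    # line in the Cython hunk above.
    # cache[hashed] = lexemes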
@@ -149,17 +139,14 @@ cdef class Language:
         token_rules (list): A list of (chunk, tokens) pairs, where chunk is
             a string and tokens is a list of strings.
         '''
-        cdef LexList* node
+        cdef list lexemes
         cdef uint64_t hashed
         for string, substrings in token_rules:
             hashed = hash64(<Py_UNICODE*>string, len(string) * sizeof(Py_UNICODE), 0)
-            node = <LexList*>calloc(1, sizeof(LexList))
-            self.cache[hashed] = <size_t>node
-            for substring in substrings[:-1]:
-                node.lex = <LexemeC*>self.lexicon.get(<Py_UNICODE*>substring, len(substring))
-                node.tail = <LexList*>calloc(1, sizeof(LexList))
-                node = node.tail
-            node.lex = <LexemeC*>self.lexicon.get(<Py_UNICODE*>substrings[-1], len(substrings[-1]))
+            lexemes = []
+            for substring in substrings:
+                lexemes.append(self.lexicon.get(<Py_UNICODE*>substring, len(substring)))
+            self.cache[hashed] = lexemes
 
 
 cdef class Lexicon:
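The rule loader above now caches every substring of a special case in one flat list keyed by the hash of the full string, instead of chaining LexList nodes and special-casing the last element. A pure-Python sketch, with lexicon_get standing in for self.lexicon.get and the example rule pair being purely hypothetical:

def load_special_tokenization(token_rules, cache, lexicon_get):
    # token_rules is a list of (chunk, substrings) pairs, per the docstring above;
    # hypothetically something like ("don't", ["do", "n't"]).
    for string, substrings in token_rules:
        cache[hash(string)] = [lexicon_get(s) for s in substrings]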
@@ -26,6 +26,8 @@ def test_punct():
 
 def test_digits():
     lex_ids = EN.tokenize('The year: 1984.')
+    assert lex_ids.string(4) == "."
+    assert lex_ids.string(3) == "1984"
     assert len(lex_ids) == 5
     assert lex_ids[0].string == EN.lexicon.lookup('The').string
     assert lex_ids[3].string == EN.lexicon.lookup('1984').string
@@ -37,5 +39,17 @@ def test_contraction():
     assert len(lex_ids) == 3
     assert lex_ids[1].string == EN.lexicon.lookup("not").string
     lex_ids = EN.tokenize("i said don't!")
-    assert len(lex_ids) == 4
-    assert lex_ids[3].string == EN.lexicon.lookup('!').string
+    assert len(lex_ids) == 5
+    assert lex_ids[4].string == EN.lexicon.lookup('!').string
+
+
+def test_contraction_punct():
+    tokens = EN.tokenize("(can't")
+    assert len(tokens) == 3
+    tokens = EN.tokenize("`ain't")
+    assert len(tokens) == 3
+    tokens = EN.tokenize('''"isn't''')
+    assert len(tokens) == 3
+    tokens = EN.tokenize("can't!")
+    assert len(tokens) == 3
+