* Replace UniStr, using unicode objects instead

Matthew Honnibal 2015-07-22 04:49:39 +02:00
parent 386246db5b
commit 109106a949
5 changed files with 90 additions and 106 deletions
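In outline: the old code built a UniStr struct over the Py_UNICODE buffer with slice_unicode, carrying the characters, their count and a precomputed murmurhash key, and passed UniStr* between the tokenizer and the vocab. After this commit the same call sites slice ordinary unicode objects and call hash_string only where a key is actually needed. A minimal pure-Python sketch of the two keying patterns (built-in hash() stands in for the murmurhash-based hash_string, and the namedtuple is only an illustration of what UniStr carried):

```python
from collections import namedtuple

# Illustration of what the removed UniStr struct carried: the characters,
# their count, and a precomputed hash key.
UniStr = namedtuple("UniStr", ["chars", "n", "key"])

def slice_unicode(chars, start, end):
    # Old pattern: build the struct up front so the key travels with the span.
    span = chars[start:end]
    return UniStr(chars=span, n=len(span), key=hash(span))

def hash_string(string):
    # New pattern: plain unicode slices, key computed on demand.
    # (Stand-in: the real hash_string runs murmurhash over the buffer.)
    return hash(string)

text = u"Hello world"
old_span = slice_unicode(text, 0, 5)
new_key = hash_string(text[0:5])
assert old_span.key == new_key
```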

View File: spacy/strings.pxd

@@ -5,7 +5,6 @@ from .typedefs cimport attr_t
 from libc.stdint cimport int64_t
-from .structs cimport UniStr
 from .typedefs cimport hash_t

 cpdef hash_t hash_string(unicode string) except 0
@@ -16,12 +15,6 @@ ctypedef union Utf8Str:
     unsigned char* p

-cdef inline void slice_unicode(UniStr* s, Py_UNICODE* chars, int start, int end) nogil:
-    s.chars = &chars[start]
-    s.n = end - start
-    s.key = hash64(s.chars, <int>(s.n * sizeof(Py_UNICODE)), 0)

 cdef class StringStore:
     cdef Pool mem
     cdef Utf8Str* c

View File: spacy/tokenizer.pxd

@@ -1,12 +1,10 @@
 from libcpp.vector cimport vector
-from cpython cimport Py_UNICODE_ISSPACE, Py_UNICODE_ISALPHA, Py_UNICODE_ISUPPER
 from preshed.maps cimport PreshMap
 from cymem.cymem cimport Pool
 from .typedefs cimport hash_t
-from .structs cimport LexemeC, TokenC, Morphology, UniStr
+from .structs cimport LexemeC, TokenC, Morphology
 from .strings cimport StringStore
 from .tokens.doc cimport Doc
 from .vocab cimport Vocab, _Cached
@@ -30,12 +28,10 @@ cdef class Tokenizer:
     cpdef Doc tokens_from_list(self, list strings)

     cdef int _try_cache(self, hash_t key, Doc tokens) except -1
-    cdef int _tokenize(self, Doc tokens, UniStr* span, int start, int end) except -1
-    cdef UniStr* _split_affixes(self, UniStr* string, vector[LexemeC*] *prefixes,
-                                vector[LexemeC*] *suffixes) except NULL
-    cdef int _attach_tokens(self, Doc tokens, int idx, UniStr* string,
+    cdef int _tokenize(self, Doc tokens, unicode span, hash_t key) except -1
+    cdef unicode _split_affixes(self, unicode string, vector[LexemeC*] *prefixes,
+                                vector[LexemeC*] *suffixes)
+    cdef int _attach_tokens(self, Doc tokens, unicode string,
                             vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1
-    cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1
-    cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1
-    cdef object _find_infix(self, Py_UNICODE* characters, size_t length)
     cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1

View File: spacy/tokenizer.pyx

@@ -6,13 +6,14 @@ import re
 from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as preinc
+from cpython cimport Py_UNICODE_ISSPACE
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
-from .structs cimport UniStr
-from .strings cimport slice_unicode
 from .morphology cimport set_morph_from_dict
+from .strings cimport hash_string
+cimport cython
 from . import util
 from .util import read_lang_data
@@ -42,17 +43,16 @@ cdef class Tokenizer:
         cdef Doc tokens = Doc(self.vocab)
         if sum([len(s) for s in strings]) == 0:
             return tokens
-        cdef UniStr string_struct
         cdef unicode py_string
         cdef int idx = 0
         for i, py_string in enumerate(strings):
-            slice_unicode(&string_struct, py_string, 0, len(py_string))
             # Note that we pass tokens.mem here --- the Doc object has ownership
             tokens.push_back(
-                <const LexemeC*>self.vocab.get(tokens.mem, &string_struct), True)
+                <const LexemeC*>self.vocab.get(tokens.mem, py_string), True)
             idx += len(py_string) + 1
         return tokens

+    @cython.boundscheck(False)
     def __call__(self, unicode string):
         """Tokenize a string.
@@ -80,16 +80,21 @@ cdef class Tokenizer:
         cdef int i = 0
         cdef int start = 0
         cdef bint cache_hit
-        cdef Py_UNICODE* chars = string
+        chars = <Py_UNICODE*>string
         cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0])
-        cdef UniStr span
+        cdef unicode span
+        # The task here is much like string.split, but not quite
+        # We find spans of whitespace and non-space characters, and ignore
+        # spans that are exactly ' '. So, our sequences will all be separated
+        # by either ' ' or nothing.
         for i in range(1, length):
             if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
                 if start < i:
-                    slice_unicode(&span, chars, start, i)
-                    cache_hit = self._try_cache(span.key, tokens)
+                    span = string[start:i]
+                    key = hash_string(span)
+                    cache_hit = self._try_cache(key, tokens)
                     if not cache_hit:
-                        self._tokenize(tokens, &span, start, i)
+                        self._tokenize(tokens, span, key)
                 in_ws = not in_ws
                 start = i
                 if chars[i] == ' ':
@@ -97,10 +102,11 @@ cdef class Tokenizer:
                     start += 1
         i += 1
         if start < i:
-            slice_unicode(&span, chars, start, i)
-            cache_hit = self._try_cache(span.key, tokens)
+            span = string[start:i]
+            key = hash_string(span)
+            cache_hit = self._try_cache(key, tokens)
             if not cache_hit:
-                self._tokenize(tokens, &span, start, i)
+                self._tokenize(tokens, span, key)
         tokens.data[tokens.length - 1].spacy = string[-1] == ' '
         return tokens
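The comment added to __call__ above spells out the whitespace handling: the input is cut into alternating space and non-space runs, and a separating run of exactly one space is skipped rather than emitted. A standalone sketch of that span-finding loop (plain Python, function name hypothetical; str.isspace() stands in for Py_UNICODE_ISSPACE):

```python
def whitespace_spans(string):
    # Sketch of the span-finding loop in Tokenizer.__call__ above:
    # emit (start, end) for alternating space/non-space runs, but skip a
    # separator that is exactly one ' ' by bumping `start` past it.
    spans = []
    if not string:
        return spans
    in_ws = string[0].isspace()
    start = 0
    i = 0
    for i in range(1, len(string)):
        if string[i].isspace() != in_ws:
            if start < i:
                spans.append((start, i))
            in_ws = not in_ws
            start = i
            if string[i] == ' ':
                start += 1
    i += 1
    if start < i:
        spans.append((start, i))
    return spans

print(whitespace_spans(u"Hello  world !"))
# [(0, 5), (6, 7), (7, 12), (13, 14)] -> 'Hello', ' ', 'world', '!'
```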
@@ -118,91 +124,89 @@ cdef class Tokenizer:
                 tokens.push_back(&cached.data.tokens[i], False)
         return True

-    cdef int _tokenize(self, Doc tokens, UniStr* span, int start, int end) except -1:
+    cdef int _tokenize(self, Doc tokens, unicode span, hash_t orig_key) except -1:
         cdef vector[LexemeC*] prefixes
         cdef vector[LexemeC*] suffixes
-        cdef hash_t orig_key
         cdef int orig_size
-        orig_key = span.key
         orig_size = tokens.length
-        self._split_affixes(span, &prefixes, &suffixes)
-        self._attach_tokens(tokens, start, span, &prefixes, &suffixes)
+        span = self._split_affixes(span, &prefixes, &suffixes)
+        self._attach_tokens(tokens, span, &prefixes, &suffixes)
         self._save_cached(&tokens.data[orig_size], orig_key, tokens.length - orig_size)

-    cdef UniStr* _split_affixes(self, UniStr* string, vector[const LexemeC*] *prefixes,
-                                vector[const LexemeC*] *suffixes) except NULL:
+    cdef unicode _split_affixes(self, unicode string, vector[const LexemeC*] *prefixes,
+                                vector[const LexemeC*] *suffixes):
         cdef size_t i
-        cdef UniStr prefix
-        cdef UniStr suffix
-        cdef UniStr minus_pre
-        cdef UniStr minus_suf
+        cdef unicode prefix
+        cdef unicode suffix
+        cdef unicode minus_pre
+        cdef unicode minus_suf
         cdef size_t last_size = 0
-        while string.n != 0 and string.n != last_size:
-            last_size = string.n
-            pre_len = self._find_prefix(string.chars, string.n)
+        while string and len(string) != last_size:
+            last_size = len(string)
+            pre_len = self.find_prefix(string)
             if pre_len != 0:
-                slice_unicode(&prefix, string.chars, 0, pre_len)
-                slice_unicode(&minus_pre, string.chars, pre_len, string.n)
+                prefix = string[:pre_len]
+                minus_pre = string[pre_len:]
                 # Check whether we've hit a special-case
-                if minus_pre.n >= 1 and self._specials.get(minus_pre.key) != NULL:
-                    string[0] = minus_pre
-                    prefixes.push_back(self.vocab.get(self.vocab.mem, &prefix))
+                if minus_pre and self._specials.get(hash_string(minus_pre)) != NULL:
+                    string = minus_pre
+                    prefixes.push_back(self.vocab.get(self.vocab.mem, prefix))
                     break
-            suf_len = self._find_suffix(string.chars, string.n)
+            suf_len = self.find_suffix(string)
             if suf_len != 0:
-                slice_unicode(&suffix, string.chars, string.n - suf_len, string.n)
-                slice_unicode(&minus_suf, string.chars, 0, string.n - suf_len)
+                suffix = string[-suf_len:]
+                minus_suf = string[:-suf_len]
                 # Check whether we've hit a special-case
-                if minus_suf.n >= 1 and self._specials.get(minus_suf.key) != NULL:
-                    string[0] = minus_suf
-                    suffixes.push_back(self.vocab.get(self.vocab.mem, &suffix))
+                if minus_suf and (self._specials.get(hash_string(minus_suf)) != NULL):
+                    string = minus_suf
+                    suffixes.push_back(self.vocab.get(self.vocab.mem, suffix))
                     break
-            if pre_len and suf_len and (pre_len + suf_len) <= string.n:
-                slice_unicode(string, string.chars, pre_len, string.n - suf_len)
-                prefixes.push_back(self.vocab.get(self.vocab.mem, &prefix))
-                suffixes.push_back(self.vocab.get(self.vocab.mem, &suffix))
+            if pre_len and suf_len and (pre_len + suf_len) <= len(string):
+                string = string[pre_len:-suf_len]
+                prefixes.push_back(self.vocab.get(self.vocab.mem, prefix))
+                suffixes.push_back(self.vocab.get(self.vocab.mem, suffix))
             elif pre_len:
-                string[0] = minus_pre
-                prefixes.push_back(self.vocab.get(self.vocab.mem, &prefix))
+                string = minus_pre
+                prefixes.push_back(self.vocab.get(self.vocab.mem, prefix))
             elif suf_len:
-                string[0] = minus_suf
-                suffixes.push_back(self.vocab.get(self.vocab.mem, &suffix))
-            if self._specials.get(string.key):
+                string = minus_suf
+                suffixes.push_back(self.vocab.get(self.vocab.mem, suffix))
+            if string and (self._specials.get(hash_string(string)) != NULL):
                 break
         return string

-    cdef int _attach_tokens(self, Doc tokens, int idx, UniStr* string,
+    cdef int _attach_tokens(self, Doc tokens, unicode string,
                             vector[const LexemeC*] *prefixes,
                             vector[const LexemeC*] *suffixes) except -1:
         cdef bint cache_hit
         cdef int split, end
         cdef const LexemeC* const* lexemes
         cdef const LexemeC* lexeme
-        cdef UniStr span
+        cdef unicode span
         cdef int i
         if prefixes.size():
             for i in range(prefixes.size()):
                 tokens.push_back(prefixes[0][i], False)
-        if string.n != 0:
-            cache_hit = self._try_cache(string.key, tokens)
+        if string:
+            cache_hit = self._try_cache(hash_string(string), tokens)
             if cache_hit:
                 pass
             else:
-                match = self._find_infix(string.chars, string.n)
+                match = self.find_infix(string)
                 if match is None:
                     tokens.push_back(self.vocab.get(tokens.mem, string), False)
                 else:
                     split = match.start()
                     end = match.end()
-                    # Append the beginning, afix, end of the infix span
-                    slice_unicode(&span, string.chars, 0, split)
-                    tokens.push_back(self.vocab.get(tokens.mem, &span), False)
-                    slice_unicode(&span, string.chars, split, end)
-                    tokens.push_back(self.vocab.get(tokens.mem, &span), False)
-                    slice_unicode(&span, string.chars, end, string.n)
-                    tokens.push_back(self.vocab.get(tokens.mem, &span), False)
+                    # Append the beginning, affix, end of the infix span
+                    span = string[:split]
+                    tokens.push_back(self.vocab.get(tokens.mem, span), False)
+                    span = string[split:end]
+                    tokens.push_back(self.vocab.get(tokens.mem, span), False)
+                    span = string[end:]
+                    tokens.push_back(self.vocab.get(tokens.mem, span), False)
         cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin()
         while it != suffixes.rend():
             lexeme = deref(it)
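Restated in unicode terms, _split_affixes above peels the longest matching prefix and suffix off the remaining string until nothing more matches (or a special case is hit), and _attach_tokens then emits the prefixes, the core span, and the suffixes in reverse collection order. A rough standalone sketch of the peeling loop, with hypothetical regexes standing in for the language data and no special-case handling:

```python
import re

# Hypothetical affix patterns; the real ones come from the language data.
PREFIX_RE = re.compile(r'^[\("\']')
SUFFIX_RE = re.compile(r'[\)"\'\.,!?]$')

def split_affixes(string):
    # Peel prefixes and suffixes off `string`, outside-in, roughly like
    # Tokenizer._split_affixes above (special cases omitted in this sketch).
    prefixes = []
    suffixes = []
    last_size = -1
    while string and len(string) != last_size:
        last_size = len(string)
        pre_match = PREFIX_RE.search(string)
        pre_len = (pre_match.end() - pre_match.start()) if pre_match else 0
        suf_match = SUFFIX_RE.search(string)
        suf_len = (suf_match.end() - suf_match.start()) if suf_match else 0
        if pre_len and suf_len and (pre_len + suf_len) <= len(string):
            prefixes.append(string[:pre_len])
            suffixes.append(string[-suf_len:])
            string = string[pre_len:-suf_len]
        elif pre_len:
            prefixes.append(string[:pre_len])
            string = string[pre_len:]
        elif suf_len:
            suffixes.append(string[-suf_len:])
            string = string[:-suf_len]
    # Suffixes are collected outside-in but attached innermost-first,
    # which is why the real code walks the suffix vector in reverse.
    return prefixes, string, list(reversed(suffixes))

print(split_affixes(u'("Hello!")'))
# (['(', '"'], 'Hello', ['!', '"', ')'])
```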
@@ -223,17 +227,14 @@ cdef class Tokenizer:
         cached.data.lexemes = <const LexemeC* const*>lexemes
         self._cache.set(key, cached)

-    cdef object _find_infix(self, Py_UNICODE* chars, size_t length):
-        cdef unicode string = chars[:length]
+    def find_infix(self, unicode string):
         return self._infix_re.search(string)

-    cdef int _find_prefix(self, Py_UNICODE* chars, size_t length) except -1:
-        cdef unicode string = chars[:length]
+    def find_prefix(self, unicode string):
         match = self._prefix_re.search(string)
         return (match.end() - match.start()) if match is not None else 0

-    cdef int _find_suffix(self, Py_UNICODE* chars, size_t length) except -1:
-        cdef unicode string = chars[:length]
+    def find_suffix(self, unicode string):
         match = self._suffix_re.search(string)
         return (match.end() - match.start()) if match is not None else 0
@@ -241,21 +242,19 @@ cdef class Tokenizer:
         '''Add a special-case tokenization rule.
         '''
         cdef int i
-        cdef unicode chunk
         cdef list substrings
+        cdef unicode chunk
         cdef unicode form
         cdef unicode lemma
         cdef dict props
         cdef LexemeC** lexemes
         cdef hash_t hashed
-        cdef UniStr string
         for chunk, substrings in sorted(rules.items()):
             tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
             for i, props in enumerate(substrings):
                 form = props['F']
                 lemma = props.get("L", None)
-                slice_unicode(&string, form, 0, len(form))
-                tokens[i].lex = <LexemeC*>self.vocab.get(self.vocab.mem, &string)
+                tokens[i].lex = <LexemeC*>self.vocab.get(self.vocab.mem, form)
                 if lemma is not None:
                     tokens[i].lemma = self.vocab.strings[lemma]
                 else:
@@ -273,6 +272,6 @@ cdef class Tokenizer:
             cached.length = len(substrings)
             cached.is_lex = False
             cached.data.tokens = tokens
-            slice_unicode(&string, chunk, 0, len(chunk))
-            self._specials.set(string.key, cached)
-            self._cache.set(string.key, cached)
+            hashed = hash_string(chunk)
+            self._specials.set(hashed, cached)
+            self._cache.set(hashed, cached)

View File: spacy/vocab.pxd

@@ -4,7 +4,7 @@ from preshed.maps cimport PreshMap
 from cymem.cymem cimport Pool
 from murmurhash.mrmr cimport hash64
-from .structs cimport LexemeC, TokenC, UniStr
+from .structs cimport LexemeC, TokenC
 from .typedefs cimport utf8_t, hash_t
 from .strings cimport StringStore
@@ -31,7 +31,7 @@ cdef class Vocab:
     cdef readonly int length
     cdef public object packer

-    cdef const LexemeC* get(self, Pool mem, UniStr* s) except NULL
+    cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
     cdef PreshMap _by_hash

View File: spacy/vocab.pyx

@@ -10,7 +10,6 @@ import math
 from .lexeme cimport EMPTY_LEXEME
 from .lexeme cimport set_lex_struct_props
 from .lexeme cimport Lexeme
-from .strings cimport slice_unicode
 from .strings cimport hash_string
 from .orth cimport word_shape
 from .typedefs cimport attr_t
@@ -62,25 +61,25 @@ cdef class Vocab:
         """The current number of lexemes stored."""
         return self.length

-    cdef const LexemeC* get(self, Pool mem, UniStr* c_str) except NULL:
+    cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
         '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
         if necessary, using memory acquired from the given pool. If the pool
         is the lexicon's own memory, the lexeme is saved in the lexicon.'''
         cdef LexemeC* lex
-        lex = <LexemeC*>self._by_hash.get(c_str.key)
+        cdef hash_t key = hash_string(string)
+        lex = <LexemeC*>self._by_hash.get(key)
         if lex != NULL:
             return lex
         cdef bint is_oov = mem is not self.mem
-        if c_str.n < 3:
+        if len(string) < 3:
             mem = self.mem
-        cdef unicode py_str = c_str.chars[:c_str.n]
         lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
-        props = self.lexeme_props_getter(py_str)
+        props = self.lexeme_props_getter(string)
         set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
         if is_oov:
             lex.id = 0
         else:
-            self._add_lex_to_vocab(c_str.key, lex)
+            self._add_lex_to_vocab(key, lex)
         return lex

     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
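Vocab.get above now keys its cache on hash_string(string) computed at the call site instead of on a precomputed UniStr.key, and still uses the memory pool it was handed to decide whether the new lexeme is registered. A pure-Python analogue of that lookup (a dict for the PreshMap, built-in hash() for hash_string, a plain dict for LexemeC; the len(string) < 3 pool switch is left out of this sketch):

```python
class MiniVocab(object):
    """Sketch of the lookup in Vocab.get after this change: key on a hash
    of the unicode string, create an entry on a miss, and only register
    entries that were built with the vocab's own memory pool."""

    def __init__(self):
        self._by_hash = {}              # stands in for the PreshMap

    def get(self, string, own_mem=True):
        key = hash(string)              # stand-in for hash_string(string)
        lex = self._by_hash.get(key)
        if lex is not None:
            return lex
        lex = {"orth": string}          # stands in for a LexemeC struct
        if own_mem:
            # Built from the vocab's own pool: save it for future lookups.
            self._by_hash[key] = lex
        return lex

vocab = MiniVocab()
a = vocab.get(u"hello")
b = vocab.get(u"hello")
assert a is b                           # second lookup hits the cache
```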
@@ -109,7 +108,6 @@ cdef class Vocab:
         An instance of the Lexeme Python class, with data copied on
         instantiation.
         '''
-        cdef UniStr c_str
         cdef const LexemeC* lexeme
         cdef attr_t orth
         if type(id_or_string) == int:
@@ -119,8 +117,7 @@ cdef class Vocab:
                 raise KeyError(id_or_string)
             assert lexeme.orth == orth, ('%d vs %d' % (lexeme.orth, orth))
         elif type(id_or_string) == unicode:
-            slice_unicode(&c_str, id_or_string, 0, len(id_or_string))
-            lexeme = self.get(self.mem, &c_str)
+            lexeme = self.get(self.mem, id_or_string)
             assert lexeme.orth == self.strings[id_or_string]
         else:
             raise ValueError("Vocab unable to map type: "
@@ -128,15 +125,14 @@ cdef class Vocab:
                              "int --> Lexeme" % str(type(id_or_string)))
         return Lexeme.from_ptr(lexeme, self.strings, self.repvec_length)

-    def __setitem__(self, unicode py_str, dict props):
-        cdef UniStr c_str
-        slice_unicode(&c_str, py_str, 0, len(py_str))
+    def __setitem__(self, unicode string, dict props):
+        cdef hash_t key = hash_string(string)
         cdef LexemeC* lex
-        lex = <LexemeC*>self._by_hash.get(c_str.key)
+        lex = <LexemeC*>self._by_hash.get(key)
         if lex == NULL:
             lex = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
         set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
-        self._add_lex_to_vocab(c_str.key, lex)
+        self._add_lex_to_vocab(key, lex)

     def dump(self, loc):
         if path.exists(loc):