diff --git a/spacy/__init__.pxd b/spacy/__init__.pxd
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/lang.pyx b/spacy/lang.pyx
index 0f8d21a24..d106f172a 100644
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -155,8 +155,10 @@ cdef class Language:
         cdef Lexeme** lexemes
         cdef Lexeme* lexeme
         cdef String span
-        idx = tokens.extend(idx, prefixes.data(), prefixes.size())
+        if prefixes.size():
+            idx = tokens.extend(idx, prefixes.data(), prefixes.size())
         if string.n != 0:
+
             lexemes = self.cache.get(string.key)
             if lexemes != NULL:
                 idx = tokens.extend(idx, lexemes, 0)
diff --git a/spacy/pos_util.py b/spacy/pos_util.py
index 5b020eaed..f8aa8edb7 100644
--- a/spacy/pos_util.py
+++ b/spacy/pos_util.py
@@ -42,7 +42,8 @@ def _parse_line(line, sep):
         assert len(subtags) == len(subtokens), [t.string for t in subtokens]
         words.append(word)
         tags.extend([Tagger.encode_pos(ptb_to_univ(pos)) for pos in subtags])
-    return EN.tokenize(' '.join(words)), tags
+    tokens = EN.tokenize(' '.join(words)), tags
+    return tokens
 
 
 def get_tagdict(train_sents):
diff --git a/spacy/utf8string.pxd b/spacy/utf8string.pxd
new file mode 100644
index 000000000..82ae50022
--- /dev/null
+++ b/spacy/utf8string.pxd
@@ -0,0 +1,21 @@
+from preshed.maps cimport PreshMap
+from cymem.cymem cimport Pool
+
+from .typedefs cimport utf8_t, id_t, hash_t
+
+
+cdef struct Utf8Str:
+    id_t i
+    hash_t key
+    utf8_t chars
+    int length
+
+
+cdef class StringStore:
+    cdef Pool mem
+    cdef PreshMap table
+    cdef Utf8Str* strings
+    cdef int size
+    cdef int _resize_at
+
+    cdef Utf8Str* intern(self, char* chars, int length) except NULL
diff --git a/spacy/utf8string.pyx b/spacy/utf8string.pyx
new file mode 100644
index 000000000..d3bc3a4fe
--- /dev/null
+++ b/spacy/utf8string.pyx
@@ -0,0 +1,74 @@
+from libc.string cimport memcpy
+
+from murmurhash.mrmr cimport hash64
+
+
+cdef class StringStore:
+    """Intern byte strings, giving each distinct string an integer ID.
+
+    Records live in a growable array of Utf8Str structs. IDs start at 1;
+    slot 0 is left unused so that 0 can stand for "missing".
+    """
+    def __init__(self):
+        self.mem = Pool()
+        self.table = PreshMap()
+        self._resize_at = 10000
+        self.strings = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
+        # Slot 0 is reserved, so the first string gets ID 1.
+        self.size = 1
+
+    property size:
+        def __get__(self):
+            # Number of interned strings (the reserved slot is not counted).
+            return self.size-1
+
+    def __getitem__(self, string_or_id):
+        """Map an ID to its bytes, or a byte string to its ID.
+
+        An int/long is looked up and its stored bytes are returned;
+        IndexError is raised unless 1 <= id < next free slot. A bytes value
+        is interned if necessary and its ID is returned. Any other type
+        raises TypeError.
+        """
+        cdef bytes byte_string
+        cdef Utf8Str* utf8str
+        if type(string_or_id) == int or type(string_or_id) == long:
+            if string_or_id < 1 or string_or_id >= self.size:
+                raise IndexError(string_or_id)
+            utf8str = &self.strings[string_or_id]
+            return utf8str.chars[:utf8str.length]
+        elif type(string_or_id) == bytes:
+            utf8str = self.intern(string_or_id, len(string_or_id))
+            return utf8str.i
+        else:
+            raise TypeError(type(string_or_id))
+
+    cdef Utf8Str* intern(self, char* chars, int length) except NULL:
+        """Return the record for chars[:length], creating it on first sight.
+
+        Lookup is by the 64-bit hash alone; the bytes themselves are not
+        compared. The returned pointer refers into self.strings.
+        """
+        # 0 means missing, but we don't bother offsetting the index. We waste
+        # slot 0 to simplify the code, because it doesn't matter.
+        assert length != 0
+        cdef hash_t key = hash64(chars, length * sizeof(char), 0)
+        cdef void* value = self.table.get(key)
+        cdef size_t i
+        if value == NULL:
+            if self.size == self._resize_at:
+                # Array is full: double its capacity.
+                self._resize_at *= 2
+                self.strings = <Utf8Str*>self.mem.realloc(self.strings, self._resize_at * sizeof(Utf8Str))
+            i = self.size
+            self.strings[i].i = self.size
+            self.strings[i].key = key
+            self.strings[i].chars = <utf8_t>self.mem.alloc(length, sizeof(char))
+            memcpy(self.strings[i].chars, chars, length)
+            self.strings[i].length = length
+            # The table stores the slot index itself, encoded as a pointer.
+            self.table.set(key, <void*>self.size)
+            self.size += 1
+        else:
+            i = <size_t>value
+        return &self.strings[i]