* Slight cleaning of tokenizer code

Matthew Honnibal 2014-10-10 19:17:22 +11:00
parent 59b41a9fd3
commit 71ee921055
7 changed files with 31 additions and 26 deletions

View File

@@ -41,6 +41,7 @@ cdef class Lexicon:
 cdef class Language:
     cdef Pool _mem
     cdef unicode name
+    cdef vector[size_t] counts
     cdef PreshMap cache
     cdef PreshMap specials
     cpdef readonly Lexicon lexicon
@@ -51,7 +52,6 @@ cdef class Language:
     cpdef Tokens tokenize(self, unicode text)
     cpdef Lexeme lookup(self, unicode text)
 
-    cdef int _check_cache(self, vector[LexemeC*] *tokens, String* string) except -1
     cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1
     cdef int _find_prefix(self, Py_UNICODE* characters, size_t length)
     cdef int _find_suffix(self, Py_UNICODE* characters, size_t length)

View File

@@ -16,6 +16,7 @@ import re
 from .util import read_lang_data
 from spacy.tokens import Tokens
 from spacy.lexeme cimport LexemeC, get_lexeme_dict, lexeme_pack, lexeme_unpack
+from spacy.lexeme cimport LexStr_orig
 
 from murmurhash.mrmr cimport hash64
 from cpython.ref cimport Py_INCREF
@@ -45,12 +46,20 @@ cdef class Language:
         self.suffix_re = re.compile(suffix)
         self.lexicon = Lexicon(lexemes)
         self._load_special_tokenization(rules)
+        self.counts = vector[size_t]()
 
     property nr_types:
         def __get__(self):
            """Return the number of lexical types in the vocabulary"""
            return self.lexicon.size
 
+    property counts:
+        def __get__(self):
+            cdef size_t i
+            for i in range(self.lexicon.size):
+                count = self.counts[i] if i < self.counts.size() else 0
+                yield count, self.lexicon.lexemes[i].strings[<int>LexStr_orig].decode('utf8')
+
     cpdef Lexeme lookup(self, unicode string):
         """Retrieve (or create, if not found) a Lexeme for a string, and return it.
@@ -85,23 +94,23 @@ cdef class Language:
         cdef size_t start = 0
         cdef size_t i = 0
         cdef Py_UNICODE* chars = string
-        cdef Py_UNICODE c
         cdef String span
         for i in range(length):
-            c = chars[i]
-            if Py_UNICODE_ISSPACE(c) == 1:
+            if Py_UNICODE_ISSPACE(chars[i]) == 1:
                 if start < i:
                     string_from_slice(&span, chars, start, i)
-                    try:
-                        self._tokenize(tokens.v, &span)
-                    except MemoryError:
-                        print chars[start:i]
-                        raise
+                    self._tokenize(tokens.v, &span)
                 start = i + 1
             i += 1
         if start < i:
             string_from_slice(&span, chars, start, i)
             self._tokenize(tokens.v, &span)
+        cdef int id_
+        for i in range(tokens.v.size()):
+            id_ = tokens.id(i)
+            while id_ >= self.counts.size():
+                self.counts.push_back(0)
+            self.counts[id_] += 1
         return tokens
 
     cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1:
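
Note: the count update above is a grow-on-demand pattern: the vector is padded with zeros until the lexeme's integer id is a valid index, then that slot is incremented. A minimal Python sketch of the same idea (a plain list stands in for vector[size_t], and the ids are invented for illustration):

    # Sketch only: a list in place of vector[size_t]; ids are made up.
    counts = []
    for id_ in [0, 2, 0, 5]:           # lexeme ids emitted by the tokenizer
        while id_ >= len(counts):      # pad with zeros until id_ is in range
            counts.append(0)
        counts[id_] += 1
    # counts == [2, 0, 1, 0, 0, 1]
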
@@ -163,17 +172,6 @@ cdef class Language:
         self._attach_tokens(tokens_v, string, &prefixes, &suffixes)
         self._save_cached(tokens_v, orig_key, orig_size)
 
-    cdef int _check_cache(self, vector[LexemeC*] *tokens, String* string) except -1:
-        lexemes = <LexemeC**>self.cache.get(string.key)
-        cdef size_t i = 0
-        if lexemes != NULL:
-            while lexemes[i] != NULL:
-                tokens.push_back(lexemes[i])
-                i += 1
-            string.n = 0
-            string.key = 0
-            string.chars = NULL
-
     cdef int _attach_tokens(self, vector[LexemeC*] *tokens, String* string,
                             vector[LexemeC*] *prefixes,
                             vector[LexemeC*] *suffixes) except -1:
@@ -261,6 +259,7 @@ cdef class Lexicon:
             lexeme = <LexemeC*>self._mem.alloc(1, sizeof(LexemeC))
             lexeme_unpack(lexeme, lexeme_dict)
             self._dict.set(string.key, lexeme)
+            self.lexemes.push_back(lexeme)
             self.size += 1
 
@@ -273,6 +272,7 @@ cdef class Lexicon:
            cdef unicode unicode_string = string.chars[:string.n]
            lexeme_unpack(lex, get_lexeme_dict(self.size, unicode_string))
            self._dict.set(string.key, lex)
+           self.lexemes.push_back(lex)
            self.size += 1
        return lex
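
Note: because the Lexicon now appends every new lexeme to its lexemes vector in insertion order, the counts property on Language can pair slot i of the count vector with the original string of lexeme i. A hedged usage sketch (the Language constructor arguments are not shown in this diff, so lang is assumed to be an already-constructed instance):

    # Assumes `lang` is a constructed Language instance (not shown here).
    lang.tokenize(u'the cat sat on the mat')
    for count, string in lang.counts:
        if count:
            print count, string        # e.g. "2 the"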

View File

@@ -21,7 +21,7 @@ cpdef enum LexFloats:
 
 cpdef enum LexStrs:
-    LexStr_key
+    LexStr_orig
     LexStr_casefix
     LexStr_shape
     LexStr_unsparse
@@ -70,6 +70,7 @@ cdef struct LexemeC:
     flag_t orth_flags
     flag_t dist_flags
 
+cpdef dict get_lexeme_dict(size_t i, unicode string)
 cdef char* intern_and_encode(unicode string, size_t* length) except NULL

View File

@@ -19,8 +19,8 @@ cpdef dict get_lexeme_dict(size_t i, unicode string):
     floats[<int>LexFloat_sentiment] = 0
 
     strings = [None for _ in range(LexStr_N)]
-    strings[<int>LexStr_key] = string
-    strings[<int>LexStr_casefix] = strings[<int>LexStr_key]
+    strings[<int>LexStr_orig] = string
+    strings[<int>LexStr_casefix] = strings[<int>LexStr_orig]
     strings[<int>LexStr_shape] = orth.word_shape(string)
     strings[<int>LexStr_unsparse] = strings[<int>LexStr_shape]
     strings[<int>LexStr_asciied] = orth.asciied(string)
@@ -42,9 +42,9 @@ def get_orth_flags(unicode string):
     flags |= orth.is_space(string) << LexOrth_space
     flags |= orth.is_title(string) << LexOrth_title
     flags |= orth.is_upper(string) << LexOrth_upper
     return flags
 
 
 def get_dist_flags(unicode string):
     return 0
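
Note: get_orth_flags packs one boolean predicate per bit of a single integer, using the LexOrth_* enum members as shift offsets. An illustrative sketch of the scheme, with bit positions assumed for the example rather than taken from the real enum:

    # Bit positions assumed for illustration only.
    LexOrth_space, LexOrth_title, LexOrth_upper = 3, 4, 5
    flags = 0
    flags |= True << LexOrth_title                 # u'Cat' is titlecased: set bit 4
    is_title = bool(flags & (1 << LexOrth_title))  # read the bit back: True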

View File

@@ -5,6 +5,7 @@ from libcpp.vector cimport vector
 
 cdef class Tokens:
     cdef vector[LexemeC*] *v
 
+    cpdef int id(self, size_t i) except -1
     cpdef unicode string(self, size_t i)
     cpdef float prob(self, size_t i) except 1
     cpdef int cluster(self, size_t i) except *

View File

@@ -40,8 +40,11 @@ cdef class Tokens:
     def append(self, Lexeme lexeme):
         self.v.push_back(lexeme._c)
 
+    cpdef int id(self, size_t i) except -1:
+        return self.v.at(i).ints[<int>LexInt_i]
+
     cpdef unicode string(self, size_t i):
-        cdef bytes utf8_string = self.v.at(i).strings[<int>LexStr_key]
+        cdef bytes utf8_string = self.v.at(i).strings[<int>LexStr_orig]
         cdef unicode string = utf8_string.decode('utf8')
         return string
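
Note: Tokens.id surfaces the lexeme's stored integer index (LexInt_i), which is exactly what the count-update loop in Language.tokenize consumes. A hedged usage sketch (tokens is assumed to come from Language.tokenize; no length accessor appears in this diff, so the token count is written out by hand):

    # Assumes `lang` is a constructed Language instance.
    tokens = lang.tokenize(u'hello world')
    for i in range(2):                 # one index per token
        print tokens.id(i), tokens.string(i)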

View File

@@ -54,7 +54,7 @@ cdef class Lexeme:
 
     property string:
         def __get__(self):
-            cdef bytes utf8_string = self._c.strings[<int>LexStr_key]
+            cdef bytes utf8_string = self._c.strings[<int>LexStr_orig]
             cdef unicode string = utf8_string.decode('utf8')
             return string