* Slight cleaning of tokenizer code

This commit is contained in:
Matthew Honnibal 2014-10-10 19:17:22 +11:00
parent 59b41a9fd3
commit 71ee921055
7 changed files with 31 additions and 26 deletions

View File

@@ -41,6 +41,7 @@ cdef class Lexicon:
cdef class Language:
    cdef Pool _mem
    cdef unicode name
    cdef vector[size_t] counts
    cdef PreshMap cache
    cdef PreshMap specials
    cpdef readonly Lexicon lexicon
@@ -51,7 +52,6 @@ cdef class Language:
    cpdef Tokens tokenize(self, unicode text)
    cpdef Lexeme lookup(self, unicode text)

    cdef int _check_cache(self, vector[LexemeC*] *tokens, String* string) except -1
    cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1
    cdef int _find_prefix(self, Py_UNICODE* characters, size_t length)
    cdef int _find_suffix(self, Py_UNICODE* characters, size_t length)

View File

@@ -16,6 +16,7 @@ import re
from .util import read_lang_data
from spacy.tokens import Tokens
from spacy.lexeme cimport LexemeC, get_lexeme_dict, lexeme_pack, lexeme_unpack
from spacy.lexeme cimport LexStr_orig
from murmurhash.mrmr cimport hash64
from cpython.ref cimport Py_INCREF
@@ -45,12 +46,20 @@ cdef class Language:
        self.suffix_re = re.compile(suffix)
        self.lexicon = Lexicon(lexemes)
        self._load_special_tokenization(rules)
        self.counts = vector[size_t]()

    property nr_types:
        def __get__(self):
            """Return the number of lexical types in the vocabulary"""
            return self.lexicon.size

    property counts:
        def __get__(self):
            cdef size_t i
            for i in range(self.lexicon.size):
                count = self.counts[i] if i < self.counts.size() else 0
                yield count, self.lexicon.lexemes[i].strings[<int>LexStr_orig].decode('utf8')

    cpdef Lexeme lookup(self, unicode string):
        """Retrieve (or create, if not found) a Lexeme for a string, and return it.
@@ -85,23 +94,23 @@ cdef class Language:
        cdef size_t start = 0
        cdef size_t i = 0
        cdef Py_UNICODE* chars = string
        cdef Py_UNICODE c
        cdef String span
        for i in range(length):
            c = chars[i]
            if Py_UNICODE_ISSPACE(c) == 1:
            if Py_UNICODE_ISSPACE(chars[i]) == 1:
                if start < i:
                    string_from_slice(&span, chars, start, i)
                    try:
                        self._tokenize(tokens.v, &span)
                    except MemoryError:
                        print chars[start:i]
                        raise
                    self._tokenize(tokens.v, &span)
                start = i + 1
        i += 1
        if start < i:
            string_from_slice(&span, chars, start, i)
            self._tokenize(tokens.v, &span)
        cdef int id_
        for i in range(tokens.v.size()):
            id_ = tokens.id(i)
            while id_ >= self.counts.size():
                self.counts.push_back(0)
            self.counts[id_] += 1
        return tokens

    cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1:
@@ -163,17 +172,6 @@ cdef class Language:
        self._attach_tokens(tokens_v, string, &prefixes, &suffixes)
        self._save_cached(tokens_v, orig_key, orig_size)

    cdef int _check_cache(self, vector[LexemeC*] *tokens, String* string) except -1:
        lexemes = <LexemeC**>self.cache.get(string.key)
        cdef size_t i = 0
        if lexemes != NULL:
            while lexemes[i] != NULL:
                tokens.push_back(lexemes[i])
                i += 1
            string.n = 0
            string.key = 0
            string.chars = NULL

    cdef int _attach_tokens(self, vector[LexemeC*] *tokens, String* string,
                            vector[LexemeC*] *prefixes,
                            vector[LexemeC*] *suffixes) except -1:
@@ -261,6 +259,7 @@ cdef class Lexicon:
            lexeme = <LexemeC*>self._mem.alloc(1, sizeof(LexemeC))
            lexeme_unpack(lexeme, lexeme_dict)
            self._dict.set(string.key, lexeme)
            self.lexemes.push_back(lexeme)
            self.size += 1

    cdef LexemeC* get(self, String* string) except NULL:
@@ -273,6 +272,7 @@ cdef class Lexicon:
        cdef unicode unicode_string = string.chars[:string.n]
        lexeme_unpack(lex, get_lexeme_dict(self.size, unicode_string))
        self._dict.set(string.key, lex)
        self.lexemes.push_back(lex)
        self.size += 1
        return lex
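
The counting added in this file is simple enough to mirror in plain Python: tokenize() grows a per-lexeme counts vector indexed by lexeme id, and the counts property pairs each count with the lexeme's original string, defaulting to zero for ids not yet seen. The sketch below is illustrative only; ToyLanguage and its members are hypothetical stand-ins (plain lists instead of C++ vectors), not spaCy's API.

    # Illustrative sketch (hypothetical ToyLanguage, not spaCy's API): mirrors
    # the grow-on-demand counts vector in tokenize() and the (count, original
    # string) pairs yielded by the counts property above.
    class ToyLanguage(object):
        def __init__(self):
            self.lexeme_strings = []   # insertion order doubles as the lexeme id
            self._ids = {}             # string -> id, stands in for the lexicon table
            self._counts = []          # stands in for vector[size_t] counts

        def _id_for(self, token):
            if token not in self._ids:
                self._ids[token] = len(self.lexeme_strings)
                self.lexeme_strings.append(token)
            return self._ids[token]

        def tokenize(self, text):
            ids = [self._id_for(tok) for tok in text.split()]
            for id_ in ids:
                while id_ >= len(self._counts):   # like counts.push_back(0)
                    self._counts.append(0)
                self._counts[id_] += 1
            return ids

        @property
        def counts(self):
            for i, string in enumerate(self.lexeme_strings):
                count = self._counts[i] if i < len(self._counts) else 0
                yield count, string

For example, lang = ToyLanguage(); lang.tokenize(u'the cat the'); list(lang.counts) gives [(2, u'the'), (1, u'cat')].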

View File

@@ -21,7 +21,7 @@ cpdef enum LexFloats:
cpdef enum LexStrs:
    LexStr_key
    LexStr_orig
    LexStr_casefix
    LexStr_shape
    LexStr_unsparse
@@ -70,6 +70,7 @@ cdef struct LexemeC:
    flag_t orth_flags
    flag_t dist_flags


cpdef dict get_lexeme_dict(size_t i, unicode string)
cdef char* intern_and_encode(unicode string, size_t* length) except NULL

View File

@@ -19,8 +19,8 @@ cpdef dict get_lexeme_dict(size_t i, unicode string):
    floats[<int>LexFloat_sentiment] = 0
    strings = [None for _ in range(LexStr_N)]
    strings[<int>LexStr_key] = string
    strings[<int>LexStr_casefix] = strings[<int>LexStr_key]
    strings[<int>LexStr_orig] = string
    strings[<int>LexStr_casefix] = strings[<int>LexStr_orig]
    strings[<int>LexStr_shape] = orth.word_shape(string)
    strings[<int>LexStr_unsparse] = strings[<int>LexStr_shape]
    strings[<int>LexStr_asciied] = orth.asciied(string)
@@ -42,9 +42,9 @@ def get_orth_flags(unicode string):
    flags |= orth.is_space(string) << LexOrth_space
    flags |= orth.is_title(string) << LexOrth_title
    flags |= orth.is_upper(string) << LexOrth_upper
    return flags


def get_dist_flags(unicode string):
    return 0
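
Viewed as data, the string slots that get_lexeme_dict fills after this change can be pictured as a plain mapping: the new LexStr_orig slot holds the original surface form, and LexStr_casefix is now seeded from it rather than from LexStr_key. A hedged sketch, with a deliberately crude stand-in for orth.word_shape and the asciied slot omitted:

    # Hedged sketch of the per-lexeme string views; the shape logic below is a
    # simplification for illustration, not the real orth.word_shape.
    def toy_lexeme_strings(string):
        shape = ''.join('X' if c.isupper() else
                        'x' if c.islower() else
                        'd' if c.isdigit() else c
                        for c in string)
        return {
            'key': string,
            'orig': string,      # new LexStr_orig slot: the original form
            'casefix': string,   # now copied from orig instead of key
            'shape': shape,
            'unsparse': shape,   # seeded from shape, as in the code above
        }

So toy_lexeme_strings(u'Apple') maps 'orig' to u'Apple' and 'shape' to u'Xxxxx'.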

View File

@@ -5,6 +5,7 @@ from libcpp.vector cimport vector
cdef class Tokens:
    cdef vector[LexemeC*] *v

    cpdef int id(self, size_t i) except -1
    cpdef unicode string(self, size_t i)
    cpdef float prob(self, size_t i) except 1
    cpdef int cluster(self, size_t i) except *

View File

@@ -40,8 +40,11 @@ cdef class Tokens:
    def append(self, Lexeme lexeme):
        self.v.push_back(lexeme._c)

    cpdef int id(self, size_t i) except -1:
        return self.v.at(i).ints[<int>LexInt_i]

    cpdef unicode string(self, size_t i):
        cdef bytes utf8_string = self.v.at(i).strings[<int>LexStr_key]
        cdef bytes utf8_string = self.v.at(i).strings[<int>LexStr_orig]
        cdef unicode string = utf8_string.decode('utf8')
        return string

View File

@@ -54,7 +54,7 @@ cdef class Lexeme:
    property string:
        def __get__(self):
            cdef bytes utf8_string = self._c.strings[<int>LexStr_key]
            cdef bytes utf8_string = self._c.strings[<int>LexStr_orig]
            cdef unicode string = utf8_string.decode('utf8')
            return string
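
Both Tokens.string and Lexeme.string now read the LexStr_orig slot: the C struct stores UTF-8 bytes, and the Python-facing accessor decodes them on demand. A minimal, self-contained sketch of that pattern (the toy dict stands in for the strings array on LexemeC):

    # Minimal sketch of the accessor pattern shared by Tokens.string and
    # Lexeme.string: bytes stored on the struct, unicode returned to callers.
    def string_view(strings, slot='orig'):
        utf8_string = strings[slot]        # bytes, as stored on the LexemeC struct
        return utf8_string.decode('utf8')  # unicode handed back to Python

    assert string_view({'orig': u'caf\xe9'.encode('utf8')}) == u'caf\xe9'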