From 71ee92105598a6372ea3eb2edea1e56f1f0a83b6 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 10 Oct 2014 19:17:22 +1100
Subject: [PATCH] * Slight cleaning of tokenizer code

---
 spacy/lang.pxd   |  2 +-
 spacy/lang.pyx   | 38 +++++++++++++++++++-------------------
 spacy/lexeme.pxd |  3 ++-
 spacy/lexeme.pyx |  6 +++---
 spacy/tokens.pxd |  1 +
 spacy/tokens.pyx |  5 ++++-
 spacy/word.pyx   |  2 +-
 7 files changed, 31 insertions(+), 26 deletions(-)

diff --git a/spacy/lang.pxd b/spacy/lang.pxd
index 906b9231f..fc41e7851 100644
--- a/spacy/lang.pxd
+++ b/spacy/lang.pxd
@@ -41,6 +41,7 @@ cdef class Lexicon:
 cdef class Language:
     cdef Pool _mem
     cdef unicode name
+    cdef vector[size_t] counts
     cdef PreshMap cache
     cdef PreshMap specials
     cpdef readonly Lexicon lexicon
@@ -51,7 +52,6 @@ cdef class Language:
     cpdef Tokens tokenize(self, unicode text)
     cpdef Lexeme lookup(self, unicode text)
 
-    cdef int _check_cache(self, vector[LexemeC*] *tokens, String* string) except -1
     cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1
     cdef int _find_prefix(self, Py_UNICODE* characters, size_t length)
     cdef int _find_suffix(self, Py_UNICODE* characters, size_t length)
diff --git a/spacy/lang.pyx b/spacy/lang.pyx
index 73f5d358a..831d79999 100644
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -16,6 +16,7 @@ import re
 from .util import read_lang_data
 from spacy.tokens import Tokens
 from spacy.lexeme cimport LexemeC, get_lexeme_dict, lexeme_pack, lexeme_unpack
+from spacy.lexeme cimport LexStr_orig
 
 from murmurhash.mrmr cimport hash64
 from cpython.ref cimport Py_INCREF
@@ -45,12 +46,20 @@ cdef class Language:
         self.suffix_re = re.compile(suffix)
         self.lexicon = Lexicon(lexemes)
         self._load_special_tokenization(rules)
+        self.counts = vector[size_t]()
 
     property nr_types:
         def __get__(self):
             """Return the number of lexical types in the vocabulary"""
             return self.lexicon.size
 
+    property counts:
+        def __get__(self):
+            cdef size_t i
+            for i in range(self.lexicon.size):
+                count = self.counts[i] if i < self.counts.size() else 0
+                yield count, self.lexicon.lexemes[i].strings[LexStr_orig].decode('utf8')
+
     cpdef Lexeme lookup(self, unicode string):
         """Retrieve (or create, if not found) a Lexeme for a string, and return it.
@@ -85,23 +94,23 @@ cdef class Language:
         cdef size_t start = 0
         cdef size_t i = 0
         cdef Py_UNICODE* chars = string
-        cdef Py_UNICODE c
         cdef String span
         for i in range(length):
-            c = chars[i]
-            if Py_UNICODE_ISSPACE(c) == 1:
+            if Py_UNICODE_ISSPACE(chars[i]) == 1:
                 if start < i:
                     string_from_slice(&span, chars, start, i)
-                    try:
-                        self._tokenize(tokens.v, &span)
-                    except MemoryError:
-                        print chars[start:i]
-                        raise
+                    self._tokenize(tokens.v, &span)
                 start = i + 1
             i += 1
         if start < i:
             string_from_slice(&span, chars, start, i)
             self._tokenize(tokens.v, &span)
+        cdef int id_
+        for i in range(tokens.v.size()):
+            id_ = tokens.id(i)
+            while id_ >= self.counts.size():
+                self.counts.push_back(0)
+            self.counts[id_] += 1
         return tokens
 
     cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1:
@@ -163,17 +172,6 @@ cdef class Language:
         self._attach_tokens(tokens_v, string, &prefixes, &suffixes)
         self._save_cached(tokens_v, orig_key, orig_size)
 
-    cdef int _check_cache(self, vector[LexemeC*] *tokens, String* string) except -1:
-        lexemes = <LexemeC**>self.cache.get(string.key)
-        cdef size_t i = 0
-        if lexemes != NULL:
-            while lexemes[i] != NULL:
-                tokens.push_back(lexemes[i])
-                i += 1
-            string.n = 0
-            string.key = 0
-            string.chars = NULL
-
     cdef int _attach_tokens(self, vector[LexemeC*] *tokens,
                             String* string,
                             vector[LexemeC*] *prefixes,
                             vector[LexemeC*] *suffixes) except -1:
@@ -261,6 +259,7 @@ cdef class Lexicon:
             lexeme = <LexemeC*>self._mem.alloc(1, sizeof(LexemeC))
             lexeme_unpack(lexeme, lexeme_dict)
             self._dict.set(string.key, lexeme)
+            self.lexemes.push_back(lexeme)
             self.size += 1
 
     cdef LexemeC* get(self, String* string) except NULL:
@@ -273,6 +272,7 @@ cdef class Lexicon:
         cdef unicode unicode_string = string.chars[:string.n]
         lexeme_unpack(lex, get_lexeme_dict(self.size, unicode_string))
         self._dict.set(string.key, lex)
+        self.lexemes.push_back(lex)
         self.size += 1
         return lex
 
diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd
index d7c85619d..11b40e0e8 100644
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -21,7 +21,7 @@ cpdef enum LexFloats:
 
 
 cpdef enum LexStrs:
-    LexStr_key
+    LexStr_orig
     LexStr_casefix
     LexStr_shape
     LexStr_unsparse
@@ -70,6 +70,7 @@ cdef struct LexemeC:
     flag_t orth_flags
     flag_t dist_flags
 
+
 cpdef dict get_lexeme_dict(size_t i, unicode string)
 
 cdef char* intern_and_encode(unicode string, size_t* length) except NULL
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index b84ed4a02..8df0e554c 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -19,8 +19,8 @@ cpdef dict get_lexeme_dict(size_t i, unicode string):
     floats[LexFloat_sentiment] = 0
 
     strings = [None for _ in range(LexStr_N)]
-    strings[LexStr_key] = string
-    strings[LexStr_casefix] = strings[LexStr_key]
+    strings[LexStr_orig] = string
+    strings[LexStr_casefix] = strings[LexStr_orig]
     strings[LexStr_shape] = orth.word_shape(string)
     strings[LexStr_unsparse] = strings[LexStr_shape]
     strings[LexStr_asciied] = orth.asciied(string)
@@ -42,9 +42,9 @@ def get_orth_flags(unicode string):
     flags |= orth.is_space(string) << LexOrth_space
     flags |= orth.is_title(string) << LexOrth_title
     flags |= orth.is_upper(string) << LexOrth_upper
-
     return flags
 
+
 def get_dist_flags(unicode string):
     return 0
 
diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd
index b138387bf..8fd58ea8c 100644
--- a/spacy/tokens.pxd
+++ b/spacy/tokens.pxd
@@ -5,6 +5,7 @@ from libcpp.vector cimport vector
 cdef class Tokens:
     cdef vector[LexemeC*] *v
 
+    cpdef int id(self, size_t i) except -1
     cpdef unicode string(self, size_t i)
     cpdef float prob(self, size_t i) except 1
     cpdef int cluster(self, size_t i) except *
diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx
index 18f0c1533..c15ad7de1 100644
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@@ -40,8 +40,11 @@ cdef class Tokens:
     def append(self, Lexeme lexeme):
         self.v.push_back(lexeme._c)
 
+    cpdef int id(self, size_t i) except -1:
+        return self.v.at(i).ints[LexInt_i]
+
     cpdef unicode string(self, size_t i):
-        cdef bytes utf8_string = self.v.at(i).strings[LexStr_key]
+        cdef bytes utf8_string = self.v.at(i).strings[LexStr_orig]
         cdef unicode string = utf8_string.decode('utf8')
         return string
 
diff --git a/spacy/word.pyx b/spacy/word.pyx
index 617e8809f..ab4ee6b68 100644
--- a/spacy/word.pyx
+++ b/spacy/word.pyx
@@ -54,7 +54,7 @@ cdef class Lexeme:
 
     property string:
         def __get__(self):
-            cdef bytes utf8_string = self._c.strings[LexStr_key]
+            cdef bytes utf8_string = self._c.strings[LexStr_orig]
             cdef unicode string = utf8_string.decode('utf8')
             return string
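
Note on the new counting scheme, for readers skimming the diff: Language.tokenize now tallies token frequencies in a flat vector[size_t] indexed by the dense lexeme id that Tokens.id() exposes, padding the vector with zeros whenever an id falls past its current end, and the new counts property pairs each lexeme's frequency with its original string. Below is a minimal pure-Python sketch of the same scheme; the CountingTokenizer class and its helpers are illustrative stand-ins, not part of spaCy's API.

    class CountingTokenizer(object):
        """Toy analogue of Language: dense ids plus a grow-on-demand count array."""

        def __init__(self):
            self.ids = {}      # string -> dense integer id (stands in for the Lexicon)
            self.counts = []   # counts[i] = frequency of lexeme id i

        def _id(self, token):
            # Assign dense ids in first-seen order, much as Lexicon.get hands out
            # self.size as the id of each newly interned string.
            return self.ids.setdefault(token, len(self.ids))

        def tokenize(self, text):
            tokens = text.split()
            for tok in tokens:
                id_ = self._id(tok)
                # Mirror the patch: pad with zeros until id_ is in range, then bump.
                # Ids are dense, so the array stays compact and growth is amortised.
                while id_ >= len(self.counts):
                    self.counts.append(0)
                self.counts[id_] += 1
            return tokens

        def frequencies(self):
            # Analogue of the new `counts` property: yield (count, string) pairs.
            # The bounds guard mirrors the real property, where the Lexicon can
            # hold lexemes the counts vector has never reached.
            for string, i in sorted(self.ids.items(), key=lambda kv: kv[1]):
                count = self.counts[i] if i < len(self.counts) else 0
                yield count, string

    tokenizer = CountingTokenizer()
    tokenizer.tokenize(u"the cat sat on the mat")
    assert dict((s, c) for c, s in tokenizer.frequencies())[u"the"] == 2

A dense vector suits this better than a hash table because lexeme ids are small consecutive integers: each lookup is a single array access, and the zero-padding loop only runs the first time a new id appears.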