diff --git a/spacy/en.pxd b/spacy/en.pxd
index a8eb0b060..23e11d1dc 100644
--- a/spacy/en.pxd
+++ b/spacy/en.pxd
@@ -1,23 +1,21 @@
 from libcpp.vector cimport vector
 from spacy.spacy cimport StringHash
 
-from spacy.lexeme cimport Lexeme
-from spacy.lexeme cimport LexID
-from spacy.lexeme cimport ClusterID
 from spacy.spacy cimport Language
+from spacy.word cimport Word
 from spacy.tokens cimport Tokens
 
 cimport cython
 
 
 cdef class English(spacy.Language):
     cdef int find_split(self, unicode word)
-    cdef int set_orth(self, unicode word, Lexeme* lex) except -1
+    cdef int set_orth(self, unicode word, Word lex) except -1
 
 
 cdef English EN
 
 
-cpdef LexID lookup(unicode word) except 0
-cpdef Tokens tokenize(unicode string)
+cpdef Word lookup(unicode word)
+cpdef list tokenize(unicode string)
 cpdef unicode unhash(StringHash hash_value)
diff --git a/spacy/en.pyx b/spacy/en.pyx
index a35f93950..9045a692e 100644
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@@ -45,7 +45,6 @@ cimport spacy
 
 from spacy.orthography.latin cimport *
-from spacy.lexeme cimport *
 
 from .orthography.latin import *
 from .lexeme import *
 
@@ -96,7 +95,7 @@ cdef bint check_punct(unicode word, size_t i, size_t length):
 EN = English('en')
 
 
-cpdef Tokens tokenize(unicode string):
+cpdef list tokenize(unicode string):
     """Tokenize a string.
 
     The tokenization rules are defined in two places:
@@ -113,7 +112,7 @@ cpdef Tokens tokenize(unicode string):
     return EN.tokenize(string)
 
 
-cpdef LexID lookup(unicode string) except 0:
+cpdef Word lookup(unicode string):
     """Retrieve (or create, if not found) a Lexeme for a string, and return its ID.
 
     Properties of the Lexeme are accessed by passing LexID to the accessor methods.
@@ -125,7 +124,7 @@ cpdef LexID lookup(unicode string) except 0:
     Returns:
         lexeme (LexID): A reference to a lexical type.
""" - return EN.lookup(string) + return EN.lookup(string) cpdef unicode unhash(StringHash hash_value): diff --git a/spacy/spacy.pxd b/spacy/spacy.pxd index 91b361f0a..7b6a38a1a 100644 --- a/spacy/spacy.pxd +++ b/spacy/spacy.pxd @@ -1,21 +1,9 @@ -from libcpp.vector cimport vector from libc.stdint cimport uint32_t from libc.stdint cimport uint64_t +from spacy.word cimport Word -# Circular import problems here -ctypedef size_t Lexeme_addr ctypedef uint32_t StringHash -from spacy.lexeme cimport Lexeme -from spacy.tokens cimport Tokens - -# Put these above import to avoid circular import problem -ctypedef char Bits8 -ctypedef uint64_t Bits64 -ctypedef int ClusterID - - -from spacy.lexeme cimport Lexeme cdef class Language: @@ -24,16 +12,16 @@ cdef class Language: cdef dict vocab cdef dict bacov - cpdef Tokens tokenize(self, unicode text) + cpdef list tokenize(self, unicode text) - cdef Lexeme* lookup(self, unicode string) except NULL - cdef Lexeme** lookup_chunk(self, unicode chunk) except NULL + cdef Word lookup(self, unicode string) + cdef list lookup_chunk(self, unicode chunk) - cdef Lexeme** new_chunk(self, unicode string, list substrings) except NULL - cdef Lexeme* new_lexeme(self, unicode lex) except NULL + cdef list new_chunk(self, unicode string, list substrings) + cdef Word new_lexeme(self, unicode lex) cpdef unicode unhash(self, StringHash hashed) cpdef list find_substrings(self, unicode chunk) cdef int find_split(self, unicode word) - cdef int set_orth(self, unicode string, Lexeme* word) + cdef int set_orth(self, unicode string, Word word) diff --git a/spacy/spacy.pyx b/spacy/spacy.pyx index 57877250f..10f89a2ed 100644 --- a/spacy/spacy.pyx +++ b/spacy/spacy.pyx @@ -14,9 +14,6 @@ from libc.stdlib cimport calloc, free from libcpp.pair cimport pair from cython.operator cimport dereference as deref -from spacy.lexeme cimport Lexeme -from spacy.lexeme cimport LexID - from . import util from os import path @@ -33,7 +30,7 @@ cdef class Language: self.load_tokenization(util.read_tokenization(name)) self.load_dist_info(util.read_dist_info(name)) - cpdef Tokens tokenize(self, unicode string): + cpdef list tokenize(self, unicode string): """Tokenize. Split the string into tokens. @@ -44,8 +41,8 @@ cdef class Language: Returns: tokens (Tokens): A Tokens object. 
""" - cdef Lexeme** chunk - cdef Tokens tokens = Tokens(self) + cdef list chunk + cdef list tokens = [] cdef size_t length = len(string) cdef size_t start = 0 cdef size_t i = 0 @@ -53,64 +50,50 @@ cdef class Language: if _is_whitespace(c): if start < i: chunk = self.lookup_chunk(string[start:i]) - _extend(tokens, chunk) + tokens.extend(chunk) start = i + 1 i += 1 if start < i: chunk = self.lookup_chunk(string[start:]) - _extend(tokens, chunk) + tokens.extend(chunk) return tokens - cdef Lexeme* lookup(self, unicode string) except NULL: + cdef Word lookup(self, unicode string): assert len(string) != 0 - cdef Lexeme* word - cdef LexID lex_id + cdef Word word cdef StringHash h = hash(string) if h in self.vocab: - lex_id = self.vocab[h] - word = lex_id + word = self.vocab[h] else: word = self.new_lexeme(string) return word - cdef Lexeme** lookup_chunk(self, unicode string) except NULL: + cdef list lookup_chunk(self, unicode string): cdef StringHash h = hash(string) - cdef Lexeme** chunk + cdef list chunk cdef size_t chunk_id if h in self.chunks: - chunk_id = self.chunks[h] - chunk = chunk_id + chunk = self.chunks[h] else: chunk = self.new_chunk(string, self.find_substrings(string)) return chunk - cdef Lexeme** new_chunk(self, unicode string, list substrings) except NULL: - cdef Lexeme** chunk = calloc(len(substrings) + 1, sizeof(Lexeme*)) + cdef list new_chunk(self, unicode string, list substrings): + chunk = [] for i, substring in enumerate(substrings): - chunk[i] = self.lookup(substring) - chunk[i + 1] = NULL + chunk.append(self.lookup(substring)) cdef StringHash h = hash(string) - self.chunks[h] = chunk + self.chunks[h] = chunk return chunk - cdef Lexeme* new_lexeme(self, unicode string) except NULL: - cdef Lexeme* word = calloc(1, sizeof(Lexeme)) - cdef bytes byte_string = string.encode('utf8') - word.string = byte_string - word.length = len(byte_string) - word.lex = hash(string) - word.string_views = calloc(len(self.view_funcs), sizeof(StringHash)) - cdef unicode view - cdef StringHash hashed - for i, view_func in enumerate(self.view_funcs): - view = view_func(string) - hashed = hash(view) - word.string_views[i] = hashed - self.bacov[hashed] = view + cdef Word new_lexeme(self, unicode string): + string_views = [view_func(string) for view_func in self.view_funcs] + word = Word(string.encode('utf8'), string_views) self.bacov[word.lex] = string - self.vocab[word.lex] = word + self.vocab[word.lex] = word return word + """ def add_view_funcs(self, list view_funcs): self.view_funcs.extend(view_funcs) cdef size_t nr_views = len(self.view_funcs) @@ -132,6 +115,7 @@ cdef class Language: hashed = hash(view) word.string_views[i] = hashed self.bacov[hashed] = view + """ cpdef unicode unhash(self, StringHash hash_value): '''Fetch a string from the reverse index, given its hash value.''' @@ -162,7 +146,7 @@ cdef class Language: cdef int find_split(self, unicode word): return len(word) - cdef int set_orth(self, unicode string, Lexeme* word): + cdef int set_orth(self, unicode string, Word word): pass def load_tokenization(self, token_rules): @@ -190,7 +174,7 @@ cdef class Language: ''' cdef unicode string cdef dict word_dist - cdef Lexeme* w + cdef Word w for string, word_dist in dist_info.items(): w = self.lookup(string) w.prob = word_dist.prob @@ -212,9 +196,9 @@ cdef inline bint _is_whitespace(Py_UNICODE c) nogil: return False -cdef inline int _extend(Tokens tokens, Lexeme** chunk) nogil: - cdef size_t i = 0 - while chunk[i] != NULL: - tokens.vctr[0].push_back(chunk[i]) - tokens.length += 1 - i += 1 
+#cdef inline int _extend(Tokens tokens, Lexeme** chunk) nogil:
+#    cdef size_t i = 0
+#    while chunk[i] != NULL:
+#        tokens.vctr[0].push_back(chunk[i])
+#        tokens.length += 1
+#        i += 1
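
Note: one half of this change is that a chunk goes from a NULL-terminated Lexeme** array to a plain Python list, which is why the nogil _extend helper above is commented out and tokenize now simply calls tokens.extend(chunk). The snippet below is a rough pure-Python mirror of the patched whitespace loop, using a trivial stand-in chunker (the real method goes through lookup_chunk and find_substrings); it is an illustration, not code from the patch.

# Hypothetical pure-Python mirror of Language.tokenize after this patch.
def tokenize(lookup_chunk, string):
    tokens = []
    start = 0
    i = 0
    for c in string:
        if c.isspace():                  # stands in for the Cython _is_whitespace check
            if start < i:
                tokens.extend(lookup_chunk(string[start:i]))
            start = i + 1
        i += 1
    if start < i:
        tokens.extend(lookup_chunk(string[start:]))
    return tokens

print(tokenize(lambda s: [s], u"isn't  it"))     # two chunks: "isn't", "it"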
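Note: spacy/word.pxd is not included in this patch, so the exact interface of the new Word type is an assumption. The plain-Python sketch below only illustrates the shape of the object the patched code appears to rely on: it is constructed from the UTF-8 bytes plus the precomputed string views (see new_lexeme), and it is expected to expose lex, string_views and prob (see load_dist_info). Names and details are hypothetical.

# Hypothetical stand-in for spacy.word.Word, inferred from how the patched code uses it.
# The real Cython class may differ; field names mirror the old Lexeme struct.
class Word(object):
    def __init__(self, string, string_views):
        self.string = string                    # UTF-8 bytes, as passed in by new_lexeme
        self.length = len(string)               # length in bytes
        self.lex = hash(string.decode('utf8'))  # assumed to match the hash(string) used by lookup
        self.string_views = string_views        # alternative views, e.g. a lowercased form
        self.prob = 0.0                         # overwritten later by load_dist_info

# Mirrors Language.new_lexeme after the patch:
view_funcs = [lambda s: s.lower()]              # stand-in for self.view_funcs
string = u'Hello'
word = Word(string.encode('utf8'), [f(string) for f in view_funcs])
print(word.length, word.string_views)           # 5, ['hello']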