* Moving to Word objects in place of the Lexeme struct.

Matthew Honnibal 2014-08-22 17:28:23 +02:00
parent 47fbd0475a
commit 782806df08
4 changed files with 43 additions and 74 deletions

View File

@@ -1,23 +1,21 @@
 from libcpp.vector cimport vector
 from spacy.spacy cimport StringHash
-from spacy.lexeme cimport Lexeme
-from spacy.lexeme cimport LexID
-from spacy.lexeme cimport ClusterID
 from spacy.spacy cimport Language
+from spacy.word cimport Word
 from spacy.tokens cimport Tokens

 cimport cython

 cdef class English(spacy.Language):
     cdef int find_split(self, unicode word)
-    cdef int set_orth(self, unicode word, Lexeme* lex) except -1
+    cdef int set_orth(self, unicode word, Word lex) except -1

 cdef English EN

-cpdef LexID lookup(unicode word) except 0
-cpdef Tokens tokenize(unicode string)
+cpdef Word lookup(unicode word)
+cpdef list tokenize(unicode string)
 cpdef unicode unhash(StringHash hash_value)

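A note on the signatures above: the `except 0` and `except -1` clauses disappear wherever a function now returns a Python object. This is general Cython error-signalling convention, not something introduced by this commit: a function returning an object can report a raised exception through an internal NULL object pointer, while a function returning a plain C type still needs an explicit sentinel value. A minimal illustration (method bodies elided):

cdef class Language:
    cdef int set_orth(self, unicode string, Word word) except -1:
        # C return type: -1 is reserved as the error sentinel, so a raised
        # exception can propagate through a plain C int return.
        pass

    cdef Word lookup(self, unicode string):
        # Object return type: Cython signals errors with an internal NULL
        # object pointer, so no `except` clause is needed.
        pass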
View File

@@ -45,7 +45,6 @@ cimport spacy
 from spacy.orthography.latin cimport *
-from spacy.lexeme cimport *

 from .orthography.latin import *
 from .lexeme import *
@@ -96,7 +95,7 @@ cdef bint check_punct(unicode word, size_t i, size_t length):
 EN = English('en')

-cpdef Tokens tokenize(unicode string):
+cpdef list tokenize(unicode string):
     """Tokenize a string.

     The tokenization rules are defined in two places:
@@ -113,7 +112,7 @@ cpdef Tokens tokenize(unicode string):
     return EN.tokenize(string)

-cpdef LexID lookup(unicode string) except 0:
+cpdef Word lookup(unicode string):
     """Retrieve (or create, if not found) a Lexeme for a string, and return its ID.

     Properties of the Lexeme are accessed by passing LexID to the accessor methods.
@@ -125,7 +124,7 @@ cpdef LexID lookup(unicode string) except 0:
     Returns:
         lexeme (LexID): A reference to a lexical type.
     """
-    return <LexID>EN.lookup(string)
+    return EN.lookup(string)

 cpdef unicode unhash(StringHash hash_value):

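A hypothetical session against the reworked module-level API (the import path and example strings are assumptions; only the signatures come from the diff above):

from spacy.en import tokenize, lookup, unhash  # assumed module path

tokens = tokenize(u'the dog barked')   # now a plain Python list of Word objects
word = lookup(u'dog')                  # a Word object rather than a LexID integer
assert unhash(word.lex) == u'dog'      # word.lex holds the StringHash of the text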
View File

@ -1,21 +1,9 @@
from libcpp.vector cimport vector
from libc.stdint cimport uint32_t from libc.stdint cimport uint32_t
from libc.stdint cimport uint64_t from libc.stdint cimport uint64_t
from spacy.word cimport Word
# Circular import problems here
ctypedef size_t Lexeme_addr
ctypedef uint32_t StringHash ctypedef uint32_t StringHash
from spacy.lexeme cimport Lexeme
from spacy.tokens cimport Tokens
# Put these above import to avoid circular import problem
ctypedef char Bits8
ctypedef uint64_t Bits64
ctypedef int ClusterID
from spacy.lexeme cimport Lexeme
cdef class Language: cdef class Language:
@@ -24,16 +12,16 @@ cdef class Language:
     cdef dict vocab
     cdef dict bacov

-    cpdef Tokens tokenize(self, unicode text)
+    cpdef list tokenize(self, unicode text)

-    cdef Lexeme* lookup(self, unicode string) except NULL
-    cdef Lexeme** lookup_chunk(self, unicode chunk) except NULL
-    cdef Lexeme** new_chunk(self, unicode string, list substrings) except NULL
-    cdef Lexeme* new_lexeme(self, unicode lex) except NULL
+    cdef Word lookup(self, unicode string)
+    cdef list lookup_chunk(self, unicode chunk)
+    cdef list new_chunk(self, unicode string, list substrings)
+    cdef Word new_lexeme(self, unicode lex)

     cpdef unicode unhash(self, StringHash hashed)

     cpdef list find_substrings(self, unicode chunk)
     cdef int find_split(self, unicode word)
-    cdef int set_orth(self, unicode string, Lexeme* word)
+    cdef int set_orth(self, unicode string, Word word)

View File

@@ -14,9 +14,6 @@ from libc.stdlib cimport calloc, free
 from libcpp.pair cimport pair
 from cython.operator cimport dereference as deref

-from spacy.lexeme cimport Lexeme
-from spacy.lexeme cimport LexID
-
 from . import util
 from os import path
@@ -33,7 +30,7 @@ cdef class Language:
         self.load_tokenization(util.read_tokenization(name))
         self.load_dist_info(util.read_dist_info(name))

-    cpdef Tokens tokenize(self, unicode string):
+    cpdef list tokenize(self, unicode string):
        """Tokenize.

        Split the string into tokens.
@@ -44,8 +41,8 @@ cdef class Language:
        Returns:
            tokens (Tokens): A Tokens object.
        """
-        cdef Lexeme** chunk
-        cdef Tokens tokens = Tokens(self)
+        cdef list chunk
+        cdef list tokens = []
         cdef size_t length = len(string)
         cdef size_t start = 0
         cdef size_t i = 0
@@ -53,64 +50,50 @@ cdef class Language:
             if _is_whitespace(c):
                 if start < i:
                     chunk = self.lookup_chunk(string[start:i])
-                    _extend(tokens, chunk)
+                    tokens.extend(chunk)
                 start = i + 1
             i += 1
         if start < i:
             chunk = self.lookup_chunk(string[start:])
-            _extend(tokens, chunk)
+            tokens.extend(chunk)
         return tokens
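The rewritten loop is a plain whitespace splitter that resolves each chunk through the chunk cache and concatenates the results with list.extend, replacing the _extend helper that walked a NULL-terminated array. A pure-Python sketch of the same control flow, inferred from the diff:

def tokenize(self, string):
    tokens = []
    start = 0
    for i, c in enumerate(string):
        if c.isspace():                  # stands in for _is_whitespace()
            if start < i:                # a chunk ended just before this space
                tokens.extend(self.lookup_chunk(string[start:i]))
            start = i + 1                # resume after the space
    if start < len(string):              # trailing chunk with no final space
        tokens.extend(self.lookup_chunk(string[start:]))
    return tokens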
-    cdef Lexeme* lookup(self, unicode string) except NULL:
+    cdef Word lookup(self, unicode string):
         assert len(string) != 0
-        cdef Lexeme* word
-        cdef LexID lex_id
+        cdef Word word
         cdef StringHash h = hash(string)
         if h in self.vocab:
-            lex_id = self.vocab[h]
-            word = <Lexeme*>lex_id
+            word = self.vocab[h]
         else:
             word = self.new_lexeme(string)
         return word
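lookup is now straightforward memoization: vocab maps a string's hash to its interned Word object, where it previously stored a Lexeme* cast to an integer. In outline (pure Python, names from the diff):

def lookup(self, string):
    h = hash(string)                   # StringHash key
    if h in self.vocab:
        return self.vocab[h]           # cache hit: the interned Word
    return self.new_lexeme(string)     # miss: build and register a new Word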
-    cdef Lexeme** lookup_chunk(self, unicode string) except NULL:
+    cdef list lookup_chunk(self, unicode string):
         cdef StringHash h = hash(string)
-        cdef Lexeme** chunk
+        cdef list chunk
         cdef size_t chunk_id
         if h in self.chunks:
-            chunk_id = self.chunks[h]
-            chunk = <Lexeme**>chunk_id
+            chunk = self.chunks[h]
         else:
             chunk = self.new_chunk(string, self.find_substrings(string))
         return chunk
-    cdef Lexeme** new_chunk(self, unicode string, list substrings) except NULL:
-        cdef Lexeme** chunk = <Lexeme**>calloc(len(substrings) + 1, sizeof(Lexeme*))
+    cdef list new_chunk(self, unicode string, list substrings):
+        chunk = []
         for i, substring in enumerate(substrings):
-            chunk[i] = self.lookup(substring)
-        chunk[i + 1] = NULL
+            chunk.append(self.lookup(substring))
         cdef StringHash h = hash(string)
-        self.chunks[h] = <size_t>chunk
+        self.chunks[h] = chunk
         return chunk
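new_chunk previously calloc'd a NULL-terminated Lexeme** array and stashed its address in self.chunks as a size_t; with Word objects it becomes an ordinary list, which the dict keeps alive with no manual memory management. The same logic in outline:

def new_chunk(self, string, substrings):
    # one interned Word per substring; no sentinel NULL entry required
    chunk = [self.lookup(s) for s in substrings]
    self.chunks[hash(string)] = chunk    # cache under the whole chunk's hash
    return chunk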
-    cdef Lexeme* new_lexeme(self, unicode string) except NULL:
-        cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
-        cdef bytes byte_string = string.encode('utf8')
-        word.string = <char*>byte_string
-        word.length = len(byte_string)
-        word.lex = hash(string)
-        word.string_views = <StringHash*>calloc(len(self.view_funcs), sizeof(StringHash))
-        cdef unicode view
-        cdef StringHash hashed
-        for i, view_func in enumerate(self.view_funcs):
-            view = view_func(string)
-            hashed = hash(view)
-            word.string_views[i] = hashed
-            self.bacov[hashed] = view
+    cdef Word new_lexeme(self, unicode string):
+        string_views = [view_func(string) for view_func in self.view_funcs]
+        word = Word(string.encode('utf8'), string_views)
         self.bacov[word.lex] = string
-        self.vocab[word.lex] = <LexID>word
+        self.vocab[word.lex] = word
         return word

+    """
     def add_view_funcs(self, list view_funcs):
         self.view_funcs.extend(view_funcs)
         cdef size_t nr_views = len(self.view_funcs)
@@ -132,6 +115,7 @@ cdef class Language:
             hashed = hash(view)
             word.string_views[i] = hashed
             self.bacov[hashed] = view
+    """
     cpdef unicode unhash(self, StringHash hash_value):
         '''Fetch a string from the reverse index, given its hash value.'''
@@ -162,7 +146,7 @@ cdef class Language:
     cdef int find_split(self, unicode word):
         return len(word)

-    cdef int set_orth(self, unicode string, Lexeme* word):
+    cdef int set_orth(self, unicode string, Word word):
         pass

     def load_tokenization(self, token_rules):
@@ -190,7 +174,7 @@ cdef class Language:
         '''
         cdef unicode string
         cdef dict word_dist
-        cdef Lexeme* w
+        cdef Word w
         for string, word_dist in dist_info.items():
             w = self.lookup(string)
             w.prob = word_dist.prob
@@ -212,9 +196,9 @@ cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
     return False

-cdef inline int _extend(Tokens tokens, Lexeme** chunk) nogil:
-    cdef size_t i = 0
-    while chunk[i] != NULL:
-        tokens.vctr[0].push_back(<Lexeme_addr>chunk[i])
-        tokens.length += 1
-        i += 1
+#cdef inline int _extend(Tokens tokens, Lexeme** chunk) nogil:
+#    cdef size_t i = 0
+#    while chunk[i] != NULL:
+#        tokens.vctr[0].push_back(<Lexeme_addr>chunk[i])
+#        tokens.length += 1
+#        i += 1
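The Word class itself (imported from spacy/word.pxd) is not part of this diff. Judging only from the constructor call in new_lexeme and the attributes the commit uses elsewhere (word.lex, word.prob), a rough Python sketch of its assumed shape might be:

class Word:
    # Assumed shape only; the real implementation is Cython and is not shown here.
    def __init__(self, bytes_string, string_views):
        self.string = bytes_string                    # UTF-8 encoded text
        self.length = len(bytes_string)
        # must match the hash(unicode) key that lookup() uses for self.vocab
        self.lex = hash(bytes_string.decode('utf8'))
        self.string_views = [hash(view) for view in string_views]
        self.prob = 0.0                               # filled in by load_dist_info()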