* Moving to Word objects in place of the Lexeme struct.

This commit is contained in:
Matthew Honnibal 2014-08-22 17:32:16 +02:00
parent 782806df08
commit 4f01df9152

View File

@ -11,8 +11,6 @@ Special-case tokenization rules are read from data/<lang>/tokenization .
from __future__ import unicode_literals from __future__ import unicode_literals
from libc.stdlib cimport calloc, free from libc.stdlib cimport calloc, free
from libcpp.pair cimport pair
from cython.operator cimport dereference as deref
from . import util from . import util
from os import path from os import path
@ -61,19 +59,17 @@ cdef class Language:
cdef Word lookup(self, unicode string): cdef Word lookup(self, unicode string):
assert len(string) != 0 assert len(string) != 0
cdef Word word cdef Word word
cdef StringHash h = hash(string) if string in self.vocab:
if h in self.vocab: word = self.vocab[string]
word = self.vocab[h]
else: else:
word = self.new_lexeme(string) word = self.new_lexeme(string)
return word return word
cdef list lookup_chunk(self, unicode string): cdef list lookup_chunk(self, unicode string):
cdef StringHash h = hash(string)
cdef list chunk cdef list chunk
cdef size_t chunk_id cdef size_t chunk_id
if h in self.chunks: if string in self.chunks:
chunk = self.chunks[h] chunk = self.chunks[string]
else: else:
chunk = self.new_chunk(string, self.find_substrings(string)) chunk = self.new_chunk(string, self.find_substrings(string))
return chunk return chunk
@ -82,15 +78,14 @@ cdef class Language:
chunk = [] chunk = []
for i, substring in enumerate(substrings): for i, substring in enumerate(substrings):
chunk.append(self.lookup(substring)) chunk.append(self.lookup(substring))
cdef StringHash h = hash(string) self.chunks[string] = chunk
self.chunks[h] = chunk
return chunk return chunk
cdef Word new_lexeme(self, unicode string): cdef Word new_lexeme(self, unicode string):
string_views = [view_func(string) for view_func in self.view_funcs] string_views = [view_func(string) for view_func in self.view_funcs]
word = Word(string.encode('utf8'), string_views) word = Word(string.encode('utf8'), string_views)
self.bacov[word.lex] = string self.bacov[word.lex] = string
self.vocab[word.lex] = word self.vocab[string] = word
return word return word
""" """