From 4f01df91527a8279a0dc0473bc1f576a408e9e1f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 22 Aug 2014 17:32:16 +0200 Subject: [PATCH] * Moving to Word objects in place of the Lexeme struct. --- spacy/spacy.pyx | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/spacy/spacy.pyx b/spacy/spacy.pyx index 10f89a2ed..64bac2941 100644 --- a/spacy/spacy.pyx +++ b/spacy/spacy.pyx @@ -11,8 +11,6 @@ Special-case tokenization rules are read from data//tokenization . from __future__ import unicode_literals from libc.stdlib cimport calloc, free -from libcpp.pair cimport pair -from cython.operator cimport dereference as deref from . import util from os import path @@ -61,19 +59,17 @@ cdef class Language: cdef Word lookup(self, unicode string): assert len(string) != 0 cdef Word word - cdef StringHash h = hash(string) - if h in self.vocab: - word = self.vocab[h] + if string in self.vocab: + word = self.vocab[string] else: word = self.new_lexeme(string) return word cdef list lookup_chunk(self, unicode string): - cdef StringHash h = hash(string) cdef list chunk cdef size_t chunk_id - if h in self.chunks: - chunk = self.chunks[h] + if string in self.chunks: + chunk = self.chunks[string] else: chunk = self.new_chunk(string, self.find_substrings(string)) return chunk @@ -82,15 +78,14 @@ cdef class Language: chunk = [] for i, substring in enumerate(substrings): chunk.append(self.lookup(substring)) - cdef StringHash h = hash(string) - self.chunks[h] = chunk + self.chunks[string] = chunk return chunk cdef Word new_lexeme(self, unicode string): string_views = [view_func(string) for view_func in self.view_funcs] word = Word(string.encode('utf8'), string_views) self.bacov[word.lex] = string - self.vocab[word.lex] = word + self.vocab[string] = word return word """