diff --git a/spacy/lang.pyx b/spacy/lang.pyx
index 8e64ca828..05dd390d4 100644
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -20,7 +20,7 @@ cdef class Language:
         self.name = name
         self.cache = {}
         self.lexicon = Lexicon()
-        #self.load_special_tokenization(util.read_tokenization(name))
+        self.load_special_tokenization(util.read_tokenization(name))
 
     cpdef list tokenize(self, unicode string):
         """Tokenize a string.
@@ -49,6 +49,7 @@ cdef class Language:
                 i += 1
         if start < i:
             tokens.extend(self._tokenize(string[start:]))
+        assert tokens
         return tokens
 
     cdef list _tokenize(self, unicode string):
@@ -101,7 +102,7 @@ cdef class Language:
         for string, substrings in token_rules:
             lexemes = []
             for i, substring in enumerate(substrings):
-                lexemes.append(self.lookup(substring))
+                lexemes.append(self.lexicon.lookup(substring))
             self.cache[string] = lexemes
 
 
@@ -143,13 +144,15 @@ cdef class Lexicon:
         cdef Lexeme word
         flag_id = len(self.flag_checkers)
         for string, word in self.lexicon.items():
-            if flag_checker(string, word.prob, {}):
+            if flag_checker(string, word.prob, {}, {}):
                 word.set_flag(flag_id)
         self.flag_checkers.append(flag_checker)
         return flag_id
 
     def add_transform(self, string_transform):
         self.string_transformers.append(string_transform)
+        for string, word in self.lexicon.items():
+            word.add_view(string_transform(string, word.prob, {}, {}))
         return len(self.string_transformers) - 1
 
     def load_probs(self, location):
diff --git a/tests/test_contractions.py b/tests/test_contractions.py
index 82f975b27..a3e89cb67 100644
--- a/tests/test_contractions.py
+++ b/tests/test_contractions.py
@@ -1,41 +1,41 @@
 from __future__ import unicode_literals
 
-from spacy.en import tokenize, lookup, unhash
+from spacy.en import EN
 
 
 def test_possess():
-    tokens = tokenize("Mike's")
-    assert unhash(tokens[0].lex) == "Mike"
-    assert unhash(tokens[1].lex) == "'s"
+    tokens = EN.tokenize("Mike's")
+    assert tokens[0].string == "Mike"
+    assert tokens[1].string == "'s"
     assert len(tokens) == 2
 
 
 def test_apostrophe():
-    tokens = tokenize("schools'")
+    tokens = EN.tokenize("schools'")
     assert len(tokens) == 2
-    assert unhash(tokens[1].lex) == "'"
-    assert unhash(tokens[0].lex) == "schools"
+    assert tokens[1].string == "'"
+    assert tokens[0].string == "schools"
 
 
 def test_LL():
-    tokens = tokenize("we'll")
+    tokens = EN.tokenize("we'll")
     assert len(tokens) == 2
-    assert unhash(tokens[1].lex) == "will"
-    assert unhash(tokens[0].lex) == "we"
+    assert tokens[1].string == "will"
+    assert tokens[0].string == "we"
 
 
 def test_aint():
-    tokens = tokenize("ain't")
+    tokens = EN.tokenize("ain't")
     assert len(tokens) == 2
-    assert unhash(tokens[0].lex) == "are"
-    assert unhash(tokens[1].lex) == "not"
+    assert tokens[0].string == "are"
+    assert tokens[1].string == "not"
 
 
 def test_capitalized():
-    tokens = tokenize("can't")
+    tokens = EN.tokenize("can't")
     assert len(tokens) == 2
-    tokens = tokenize("Can't")
+    tokens = EN.tokenize("Can't")
     assert len(tokens) == 2
-    tokens = tokenize("Ain't")
+    tokens = EN.tokenize("Ain't")
     assert len(tokens) == 2
-    assert unhash(tokens[0].lex) == "Are"
+    assert tokens[0].string == "Are"
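
Reviewer note: a minimal sketch of how the new four-argument callback signature might be exercised on this branch. Everything here is an assumption for illustration: is_upper_flag and lowercase_view are hypothetical callbacks, the flag-registration method is written as add_flag only because the Lexicon hunk starts mid-method and its real name isn't visible, and EN.lexicon assumes the EN instance exposes the Lexicon that Language.__init__ creates.

    from spacy.en import EN

    # Hypothetical flag checker. Per this diff, checkers now take four
    # arguments; Lexicon passes two empty dicts for the trailing ones.
    def is_upper_flag(string, prob, case_stats, tag_stats):
        return string.isupper()

    # Hypothetical string transform with the same four-argument signature.
    # add_transform() now back-fills existing entries via word.add_view(...).
    def lowercase_view(string, prob, case_stats, tag_stats):
        return string.lower()

    flag_id = EN.lexicon.add_flag(is_upper_flag)  # method name assumed, see note
    view_id = EN.lexicon.add_transform(lowercase_view)

    # Tokenization now goes through the EN instance, as the updated tests show.
    tokens = EN.tokenize("Mike's")
    assert [t.string for t in tokens] == ["Mike", "'s"]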