* Fixed contraction tests. Need to correct a problem with the way case stats and tag stats are supposed to work.

Matthew Honnibal 2014-08-27 20:22:33 +02:00
parent fdaf24604a
commit fd4e61e58b
2 changed files with 23 additions and 20 deletions

View File

@@ -20,7 +20,7 @@ cdef class Language:
         self.name = name
         self.cache = {}
         self.lexicon = Lexicon()
-        #self.load_special_tokenization(util.read_tokenization(name))
+        self.load_special_tokenization(util.read_tokenization(name))
 
     cpdef list tokenize(self, unicode string):
         """Tokenize a string.
@@ -49,6 +49,7 @@ cdef class Language:
             i += 1
         if start < i:
            tokens.extend(self._tokenize(string[start:]))
+        assert tokens
         return tokens
 
     cdef list _tokenize(self, unicode string):
@@ -101,7 +102,7 @@ cdef class Language:
         for string, substrings in token_rules:
             lexemes = []
             for i, substring in enumerate(substrings):
-                lexemes.append(self.lookup(substring))
+                lexemes.append(self.lexicon.lookup(substring))
             self.cache[string] = lexemes
@@ -143,13 +144,15 @@ cdef class Lexicon:
         cdef Lexeme word
         flag_id = len(self.flag_checkers)
         for string, word in self.lexicon.items():
-            if flag_checker(string, word.prob, {}):
+            if flag_checker(string, word.prob, {}, {}):
                 word.set_flag(flag_id)
         self.flag_checkers.append(flag_checker)
         return flag_id
 
     def add_transform(self, string_transform):
         self.string_transformers.append(string_transform)
+        for string, word in self.lexicon.items():
+            word.add_view(string_transform(string, word.prob, {}, {}))
         return len(self.string_transformers) - 1
 
     def load_probs(self, location):
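Note on the Lexicon hunks above: flag checkers and string transforms are now called with two trailing dict arguments, passed as empty {} placeholders for now, which the commit message suggests will eventually carry the case stats and tag stats. A minimal sketch of callbacks written against the widened four-argument signature; the function names and the add_flag method name are illustrative assumptions, not part of this commit:

    # Hypothetical callbacks matching the new signature
    # callback(string, prob, case_stats, tag_stats).
    # Both stats dicts are still passed as empty {} in this commit.

    def is_title(string, prob, case_stats, tag_stats):
        # A flag checker returns a bool; the Lexicon sets the flag
        # on every matching Lexeme via word.set_flag(flag_id).
        return string.istitle()

    def lowered(string, prob, case_stats, tag_stats):
        # A string transform returns an alternative "view" of the
        # word, stored on the Lexeme via word.add_view(...).
        return string.lower()

    # flag_id = lexicon.add_flag(is_title)      # assumed method name
    # view_id = lexicon.add_transform(lowered)  # returns the view's index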

View File

@@ -1,41 +1,41 @@
 from __future__ import unicode_literals
-from spacy.en import tokenize, lookup, unhash
+from spacy.en import EN
 
 
 def test_possess():
-    tokens = tokenize("Mike's")
-    assert unhash(tokens[0].lex) == "Mike"
-    assert unhash(tokens[1].lex) == "'s"
+    tokens = EN.tokenize("Mike's")
+    assert tokens[0].string == "Mike"
+    assert tokens[1].string == "'s"
     assert len(tokens) == 2
 
 
 def test_apostrophe():
-    tokens = tokenize("schools'")
+    tokens = EN.tokenize("schools'")
     assert len(tokens) == 2
-    assert unhash(tokens[1].lex) == "'"
-    assert unhash(tokens[0].lex) == "schools"
+    assert tokens[1].string == "'"
+    assert tokens[0].string == "schools"
 
 
 def test_LL():
-    tokens = tokenize("we'll")
+    tokens = EN.tokenize("we'll")
     assert len(tokens) == 2
-    assert unhash(tokens[1].lex) == "will"
-    assert unhash(tokens[0].lex) == "we"
+    assert tokens[1].string == "will"
+    assert tokens[0].string == "we"
 
 
 def test_aint():
-    tokens = tokenize("ain't")
+    tokens = EN.tokenize("ain't")
     assert len(tokens) == 2
-    assert unhash(tokens[0].lex) == "are"
-    assert unhash(tokens[1].lex) == "not"
+    assert tokens[0].string == "are"
+    assert tokens[1].string == "not"
 
 
 def test_capitalized():
-    tokens = tokenize("can't")
+    tokens = EN.tokenize("can't")
     assert len(tokens) == 2
-    tokens = tokenize("Can't")
+    tokens = EN.tokenize("Can't")
     assert len(tokens) == 2
-    tokens = tokenize("Ain't")
+    tokens = EN.tokenize("Ain't")
     assert len(tokens) == 2
-    assert unhash(tokens[0].lex) == "Are"
+    assert tokens[0].string == "Are"
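Taken together, the updated tests exercise the new top-level API: a shared EN language instance with a tokenize method, and tokens that expose their text directly via a .string attribute instead of the old unhash(token.lex) round-trip. A short sketch, using only behaviour the tests above actually assert:

    from spacy.en import EN

    # Contraction splitting comes from the special-case rules that
    # load_special_tokenization (re-enabled in this commit) installs.
    tokens = EN.tokenize("Mike's")
    assert len(tokens) == 2
    assert tokens[0].string == "Mike"
    assert tokens[1].string == "'s"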