* Generalize tokenization rules to capitals

Matthew Honnibal 2014-07-07 05:07:21 +02:00
parent df0458001d
commit 25849fc926
2 changed files with 15 additions and 0 deletions

@@ -36,6 +36,11 @@ def read_tokenization(lang):
         assert chunk not in seen, chunk
         seen.add(chunk)
         entries.append((chunk, lex, pieces))
+        if chunk[0].isalpha() and chunk[0].islower():
+            chunk = chunk[0].title() + chunk[1:]
+            lex = lex[0].title() + lex[1:]
+            seen.add(chunk)
+            entries.append((chunk, lex, pieces))
     return entries
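
In plain terms, the hunk above makes read_tokenization register a title-cased twin for every special-case rule whose chunk starts with a lower-case letter. Below is a minimal, self-contained sketch of the same idea, assuming the (chunk, lex, pieces) rule shape shown in the diff; the helper name generalize_to_capitals and the sample rule are hypothetical, not part of the commit:

# For each lower-case rule, also register a title-cased variant,
# so a rule for "ain't" implies one for "Ain't".
def generalize_to_capitals(rules):
    entries = []
    seen = set()
    for chunk, lex, pieces in rules:
        assert chunk not in seen, chunk
        seen.add(chunk)
        entries.append((chunk, lex, pieces))
        # Only alphabetic, lower-case chunks get a twin; entries such as
        # "'tis" (non-alphabetic start) or already-capitalized chunks
        # pass through unchanged.
        if chunk[0].isalpha() and chunk[0].islower():
            chunk = chunk[0].title() + chunk[1:]
            lex = lex[0].title() + lex[1:]
            seen.add(chunk)
            entries.append((chunk, lex, pieces))
    return entries

print(generalize_to_capitals([("ain't", "are", ["not"])]))
# [("ain't", 'are', ['not']), ("Ain't", 'Are', ['not'])]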

@@ -32,3 +32,13 @@ def test_aint():
     assert len(tokens) == 2
     assert unhash(lex_of(tokens[0])) == "are"
     assert unhash(lex_of(tokens[1])) == "not"
+
+
+def test_capitalized():
+    tokens = expand_chunk(lookup("can't"))
+    assert len(tokens) == 2
+    tokens = expand_chunk(lookup("Can't"))
+    assert len(tokens) == 2
+    tokens = expand_chunk(lookup("Ain't"))
+    assert len(tokens) == 2
+    assert unhash(lex_of(tokens[0])) == "Are"
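
To make the expected behavior concrete, here is a hedged, pure-Python stand-in for what test_capitalized checks. The real lookup, expand_chunk, lex_of, and unhash operate on hashed lexemes rather than plain strings, and the diff does not show how "can't" is split, so the rule table below is illustrative only:

# Hypothetical rule table after the capital generalization above.
RULES = {
    "ain't": ("are", ["not"]),
    "Ain't": ("Are", ["not"]),   # added automatically from the lower-case rule
    "can't": ("ca", ["n't"]),    # split is a guess; not shown in the diff
    "Can't": ("Ca", ["n't"]),
}

def lookup(chunk):
    return RULES[chunk]

def expand_chunk(rule):
    lex, pieces = rule
    return [lex] + pieces

tokens = expand_chunk(lookup("Ain't"))
assert len(tokens) == 2
assert tokens[0] == "Are"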