* Add tokenizer rule to fix numeric range tokenization

This commit is contained in:
Matthew Honnibal 2015-10-17 15:49:51 +11:00
parent 3ba66f2dc7
commit 454c1996d0
2 changed files with 5 additions and 0 deletions

View File

@@ -1,3 +1,4 @@
\.\.\.
(?<=[a-z])\.(?=[A-Z])
(?<=[a-zA-Z])-(?=[a-zA-Z])
(?<=[0-9])-(?=[0-9])

View File

@@ -7,6 +7,10 @@ def test_hyphen(en_tokenizer):
assert len(tokens) == 3
def test_numeric_range(en_tokenizer):
    """Check that a hyphen-joined numeric range tokenizes into three tokens.

    ``en_tokenizer`` is a fixture supplied by the test suite; it is called
    with the raw text and the length of its result is checked.
    """
    # Presumably the pieces are '0.1', '-' and '13.5' -- only the count is
    # asserted here.
    n_tokens = len(en_tokenizer('0.1-13.5'))
    assert n_tokens == 3
def test_period(en_tokenizer):
    """Check that a period between a lowercase and an uppercase letter splits.

    ``en_tokenizer`` is a fixture supplied by the test suite; the text
    'best.Known' is expected to come back as exactly three tokens.
    """
    n_tokens = len(en_tokenizer('best.Known'))
    assert n_tokens == 3