From 4b4eec8b47cf320d823fd9c6de57b7e02436553a Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 29 Dec 2015 18:09:09 +0100
Subject: [PATCH] * Fix Issue #201: Tokenization of there'll

---
 lang_data/en/generate_specials.py          |  2 +-
 lang_data/en/specials.json                 | 82 ++++++++++++++++------
 spacy/tests/tokenizer/test_contractions.py |  7 ++
 3 files changed, 69 insertions(+), 22 deletions(-)

diff --git a/lang_data/en/generate_specials.py b/lang_data/en/generate_specials.py
index 2d0fb2d14..5390a7cea 100644
--- a/lang_data/en/generate_specials.py
+++ b/lang_data/en/generate_specials.py
@@ -90,7 +90,7 @@ starting_tokens = {
     "she": {"'d": ["contrLower", "contrUpper"], "'d've": [], "'ll": ["contrLower", "contrUpper"], "'s": []},
     "should": {"'ve": [], "n't": [], "n't've": []},
     "that": {"'s": []},
-    "there": {"'d": [], "'d've": [], "'s": ["contrLower", "contrUpper"]},
+    "there": {"'d": [], "'d've": [], "'s": ["contrLower", "contrUpper"], "'ll": []},
     "they": {"'d": [], "'d've": [], "'ll": [], "'re": [], "'ve": []},
     "was": {"n't": []},
     "we": {"'d": ["contrLower", "contrUpper"], "'d've": [], "'ll": ["contrLower", "contrUpper"], "'re": ["contrLower", "contrUpper"], "'ve": []},
diff --git a/lang_data/en/specials.json b/lang_data/en/specials.json
index a3bf4804a..3600717ad 100644
--- a/lang_data/en/specials.json
+++ b/lang_data/en/specials.json
@@ -40,12 +40,14 @@
             "pos": "VB"
         }
     ],
-    "10a.m.": [
+    "There'll": [
         {
-            "F": "10"
+            "F": "There"
         },
         {
-            "F": "a.m."
+            "F": "'ll",
+            "L": "will",
+            "pos": "MD"
         }
     ],
     "E.G.": [
@@ -134,9 +136,12 @@
             "F": ":0"
         }
     ],
-    ":)": [
+    "10a.m.": [
         {
-            "F": ":)"
+            "F": "10"
+        },
+        {
+            "F": "a.m."
         }
     ],
     "aint": [
@@ -796,6 +801,15 @@
             "F": "'s"
         }
     ],
+    "2pm": [
+        {
+            "F": "2"
+        },
+        {
+            "L": "p.m.",
+            "F": "pm"
+        }
+    ],
     "Who'll": [
         {
             "F": "Who"
@@ -976,13 +990,9 @@
             "pos": "VB"
         }
     ],
-    "2pm": [
+    ":)": [
         {
-            "F": "2"
-        },
-        {
-            "L": "p.m.",
-            "F": "pm"
+            "F": ":)"
         }
     ],
     "o.O": [
@@ -998,6 +1008,11 @@
             "F": "s"
         }
     ],
+    ":((": [
+        {
+            "F": ":(("
+        }
+    ],
     "Gov.": [
         {
             "F": "Gov."
@@ -2306,6 +2321,16 @@
             "pos": "VB"
         }
     ],
+    "therell": [
+        {
+            "F": "there"
+        },
+        {
+            "F": "ll",
+            "L": "will",
+            "pos": "MD"
+        }
+    ],
     "might've": [
         {
             "F": "might"
@@ -2890,9 +2915,19 @@
             "pos": "VB"
         }
     ],
-    ":((": [
+    "Theredve": [
         {
-            "F": ":(("
+            "F": "There"
+        },
+        {
+            "F": "d",
+            "L": "would",
+            "pos": "MD"
+        },
+        {
+            "F": "ve",
+            "L": "have",
+            "pos": "VB"
         }
     ],
     "theredve": [
@@ -3274,6 +3309,16 @@
             "F": "o."
         }
     ],
+    "there'll": [
+        {
+            "F": "there"
+        },
+        {
+            "F": "'ll",
+            "L": "will",
+            "pos": "MD"
+        }
+    ],
     ":]": [
         {
             "F": ":]"
@@ -4561,19 +4606,14 @@
             "pos": "RB"
         }
     ],
-    "Theredve": [
+    "Therell": [
         {
             "F": "There"
         },
         {
-            "F": "d",
-            "L": "would",
+            "F": "ll",
+            "L": "will",
             "pos": "MD"
-        },
-        {
-            "F": "ve",
-            "L": "have",
-            "pos": "VB"
         }
     ],
     "shan't": [
diff --git a/spacy/tests/tokenizer/test_contractions.py b/spacy/tests/tokenizer/test_contractions.py
index ea93ff8b4..afb1b1fe5 100644
--- a/spacy/tests/tokenizer/test_contractions.py
+++ b/spacy/tests/tokenizer/test_contractions.py
@@ -48,3 +48,10 @@ def test_punct(en_tokenizer):
     assert len(tokens) == 2
     tokens = en_tokenizer("``We've")
     assert len(tokens) == 3
+
+
+def test_therell(en_tokenizer):
+    tokens = en_tokenizer("there'll")
+    assert len(tokens) == 2
+    assert tokens[0].text == "there"
+    assert tokens[1].text == "'ll"
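
Note for reviewers: the one-line change to starting_tokens fans out into four
specials.json entries ("there'll", "There'll", "therell", "Therell") because
the generator emits a capitalized variant and an apostrophe-dropped variant
for each contraction rule. The sketch below illustrates that fan-out; it is
not the actual generate_specials.py code. The expand() helper and its
signature are hypothetical, and it ignores the contrLower/contrUpper flags
that some rules carry.

    import json

    def expand(head, suffix, lemma, pos):
        # Hypothetical helper (not spaCy API): build the four surface
        # forms that one contraction rule in starting_tokens yields.
        forms = {}
        for first in (head, head.title()):
            # Contracted form: "there'll" / "There'll"
            forms[first + suffix] = [
                {"F": first},
                {"F": suffix, "L": lemma, "pos": pos},
            ]
            # Apostrophe-dropped form: "therell" / "Therell"
            bare = suffix.replace("'", "")
            forms[first + bare] = [
                {"F": first},
                {"F": bare, "L": lemma, "pos": pos},
            ]
        return forms

    # Mirrors the new rule "there": {..., "'ll": []} from this patch.
    print(json.dumps(expand("there", "'ll", "will", "MD"), indent=4))

The printed entries match the four added to specials.json above, and the new
test can be exercised on its own with
"py.test spacy/tests/tokenizer/test_contractions.py -k therell".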