* Fix Issue #201: Tokenization of there'll

Matthew Honnibal 2015-12-29 18:09:09 +01:00
parent f5dea1406d
commit 4b4eec8b47
3 changed files with 69 additions and 22 deletions

View File

@@ -90,7 +90,7 @@ starting_tokens = {
     "she": {"'d": ["contrLower", "contrUpper"], "'d've": [], "'ll": ["contrLower", "contrUpper"], "'s": []},
     "should": {"'ve": [], "n't": [], "n't've": []},
     "that": {"'s": []},
-    "there": {"'d": [], "'d've": [], "'s": ["contrLower", "contrUpper"]},
+    "there": {"'d": [], "'d've": [], "'s": ["contrLower", "contrUpper"], "'ll": []},
     "they": {"'d": [], "'d've": [], "'ll": [], "'re": [], "'ve": []},
     "was": {"n't": []},
     "we": {"'d": ["contrLower", "contrUpper"], "'d've": [], "'ll": ["contrLower", "contrUpper"], "'re": ["contrLower", "contrUpper"], "'ve": []},

View File

@@ -40,12 +40,14 @@
             "pos": "VB"
         }
     ],
-    "10a.m.": [
+    "There'll": [
         {
-            "F": "10"
+            "F": "There"
         },
         {
-            "F": "a.m."
+            "F": "'ll",
+            "L": "will",
+            "pos": "MD"
         }
     ],
     "E.G.": [
@@ -134,9 +136,12 @@
             "F": ":0"
         }
     ],
-    ":)": [
+    "10a.m.": [
         {
-            "F": ":)"
+            "F": "10"
+        },
+        {
+            "F": "a.m."
         }
     ],
     "aint": [
@@ -796,6 +801,15 @@
             "F": "'s"
         }
     ],
+    "2pm": [
+        {
+            "F": "2"
+        },
+        {
+            "L": "p.m.",
+            "F": "pm"
+        }
+    ],
     "Who'll": [
         {
             "F": "Who"
@@ -976,13 +990,9 @@
             "pos": "VB"
         }
     ],
-    "2pm": [
+    ":)": [
         {
-            "F": "2"
-        },
-        {
-            "L": "p.m.",
-            "F": "pm"
+            "F": ":)"
         }
     ],
     "o.O": [
@@ -998,6 +1008,11 @@
             "F": "s"
         }
     ],
+    ":((": [
+        {
+            "F": ":(("
+        }
+    ],
     "Gov.": [
         {
             "F": "Gov."
@@ -2306,6 +2321,16 @@
             "pos": "VB"
         }
     ],
+    "therell": [
+        {
+            "F": "there"
+        },
+        {
+            "F": "ll",
+            "L": "will",
+            "pos": "MD"
+        }
+    ],
     "might've": [
         {
             "F": "might"
@@ -2890,9 +2915,19 @@
             "pos": "VB"
         }
     ],
-    ":((": [
+    "Theredve": [
         {
-            "F": ":(("
+            "F": "There"
+        },
+        {
+            "F": "d",
+            "L": "would",
+            "pos": "MD"
+        },
+        {
+            "F": "ve",
+            "L": "have",
+            "pos": "VB"
         }
     ],
     "theredve": [
@@ -3274,6 +3309,16 @@
             "F": "o."
         }
     ],
+    "there'll": [
+        {
+            "F": "there"
+        },
+        {
+            "F": "'ll",
+            "L": "will",
+            "pos": "MD"
+        }
+    ],
     ":]": [
         {
             "F": ":]"
@@ -4561,19 +4606,14 @@
             "pos": "RB"
         }
     ],
-    "Theredve": [
+    "Therell": [
         {
             "F": "There"
         },
         {
-            "F": "d",
-            "L": "would",
+            "F": "ll",
+            "L": "will",
             "pos": "MD"
-        },
-        {
-            "F": "ve",
-            "L": "have",
-            "pos": "VB"
         }
     ],
     "shan't": [

View File

@@ -48,3 +48,10 @@ def test_punct(en_tokenizer):
     assert len(tokens) == 2
     tokens = en_tokenizer("``We've")
     assert len(tokens) == 3
+
+
+def test_therell(en_tokenizer):
+    tokens = en_tokenizer("there'll")
+    assert len(tokens) == 2
+    assert tokens[0].text == "there"
+    assert tokens[1].text == "'ll"
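The new test exercises only the lowercase contraction; the specials above also add There'll (as well as the apostrophe-less therell and Therell). A possible parametrized extension, assuming the suite's existing en_tokenizer fixture and pytest (the test name here is hypothetical):

import pytest

# Hypothetical extension covering both apostrophe variants added in
# specials.json; the apostrophe-less forms split into ["there", "ll"]
# and would need a separate case.
@pytest.mark.parametrize("text,first", [
    ("there'll", "there"),
    ("There'll", "There"),
])
def test_therell_variants(en_tokenizer, text, first):
    tokens = en_tokenizer(text)
    assert len(tokens) == 2
    assert tokens[0].text == first
    assert tokens[1].text == "'ll"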