Fix Issue #201: Tokenization of there'll

Matthew Honnibal 2015-12-29 18:09:09 +01:00
parent f5dea1406d
commit 4b4eec8b47
3 changed files with 69 additions and 22 deletions
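
For context: before applying its general rules, the tokenizer consults a table of special-case strings, and each entry lists the subtokens a surface string splits into; this commit adds the missing entry for "there'll". A minimal sketch of that lookup in plain Python (illustrative only: the SPECIALS name and tokenize helper below are hypothetical, and the real tokenizer integrates these exceptions with its prefix/suffix/infix rules):

# Illustrative only: mirrors the "F" (form), "L" (lemma), "pos" fields
# of the specials.json entries added in this diff.
SPECIALS = {
    "there'll": [
        {"F": "there"},
        {"F": "'ll", "L": "will", "pos": "MD"},
    ],
}

def tokenize(text):
    # Check the exception table first; fall back to whitespace splitting.
    for chunk in text.split():
        if chunk in SPECIALS:
            for subtoken in SPECIALS[chunk]:
                yield subtoken["F"]
        else:
            yield chunk

print(list(tokenize("there'll be time")))  # ['there', "'ll", 'be', 'time']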


@@ -90,7 +90,7 @@ starting_tokens = {
     "she": {"'d": ["contrLower", "contrUpper"], "'d've": [], "'ll": ["contrLower", "contrUpper"], "'s": []},
     "should": {"'ve": [], "n't": [], "n't've": []},
     "that": {"'s": []},
-    "there": {"'d": [], "'d've": [], "'s": ["contrLower", "contrUpper"]},
+    "there": {"'d": [], "'d've": [], "'s": ["contrLower", "contrUpper"], "'ll": []},
     "they": {"'d": [], "'d've": [], "'ll": [], "'re": [], "'ve": []},
     "was": {"n't": []},
     "we": {"'d": ["contrLower", "contrUpper"], "'d've": [], "'ll": ["contrLower", "contrUpper"], "'re": ["contrLower", "contrUpper"], "'ve": []},


@@ -40,12 +40,14 @@
         "pos": "VB"
     }
 ],
-"10a.m.": [
+"There'll": [
     {
-        "F": "10"
+        "F": "There"
     },
     {
-        "F": "a.m."
+        "F": "'ll",
+        "L": "will",
+        "pos": "MD"
     }
 ],
 "E.G.": [
@@ -134,9 +136,12 @@
         "F": ":0"
     }
 ],
-":)": [
+"10a.m.": [
     {
-        "F": ":)"
+        "F": "10"
+    },
+    {
+        "F": "a.m."
     }
 ],
 "aint": [
@@ -796,6 +801,15 @@
         "F": "'s"
     }
 ],
+"2pm": [
+    {
+        "F": "2"
+    },
+    {
+        "L": "p.m.",
+        "F": "pm"
+    }
+],
 "Who'll": [
     {
         "F": "Who"
@@ -976,13 +990,9 @@
         "pos": "VB"
     }
 ],
-"2pm": [
+":)": [
     {
-        "F": "2"
-    },
-    {
-        "L": "p.m.",
-        "F": "pm"
+        "F": ":)"
     }
 ],
 "o.O": [
@@ -998,6 +1008,11 @@
         "F": "s"
     }
 ],
+":((": [
+    {
+        "F": ":(("
+    }
+],
 "Gov.": [
     {
         "F": "Gov."
@@ -2306,6 +2321,16 @@
         "pos": "VB"
     }
 ],
+"therell": [
+    {
+        "F": "there"
+    },
+    {
+        "F": "ll",
+        "L": "will",
+        "pos": "MD"
+    }
+],
 "might've": [
     {
         "F": "might"
@@ -2890,9 +2915,19 @@
         "pos": "VB"
     }
 ],
-":((": [
+"Theredve": [
     {
-        "F": ":(("
+        "F": "There"
+    },
+    {
+        "F": "d",
+        "L": "would",
+        "pos": "MD"
+    },
+    {
+        "F": "ve",
+        "L": "have",
+        "pos": "VB"
     }
 ],
 "theredve": [
@@ -3274,6 +3309,16 @@
         "F": "o."
     }
 ],
+"there'll": [
+    {
+        "F": "there"
+    },
+    {
+        "F": "'ll",
+        "L": "will",
+        "pos": "MD"
+    }
+],
 ":]": [
     {
         "F": ":]"
@@ -4561,19 +4606,14 @@
         "pos": "RB"
     }
 ],
-"Theredve": [
+"Therell": [
     {
         "F": "There"
     },
     {
-        "F": "d",
-        "L": "would",
+        "F": "ll",
+        "L": "will",
         "pos": "MD"
-    },
-    {
-        "F": "ve",
-        "L": "have",
-        "pos": "VB"
     }
 ],
 "shan't": [


@@ -48,3 +48,10 @@ def test_punct(en_tokenizer):
     assert len(tokens) == 2
     tokens = en_tokenizer("``We've")
     assert len(tokens) == 3
+
+
+def test_therell(en_tokenizer):
+    tokens = en_tokenizer("there'll")
+    assert len(tokens) == 2
+    assert tokens[0].text == "there"
+    assert tokens[1].text == "'ll"
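
Not part of this commit, but the generated entries above suggest natural companion checks for the cased and apostrophe-stripped variants; a sketch assuming the same en_tokenizer fixture:

def test_therell_variants(en_tokenizer):
    # Hypothetical: the specials table also gains There'll, therell and
    # Therell, so each of these should split into exactly two tokens.
    for text in ["There'll", "therell", "Therell"]:
        tokens = en_tokenizer(text)
        assert len(tokens) == 2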