Remove English exceptions with mismatched features (#10873)

Remove English contraction exceptions whose morphological features don't match the base word, which generated spurious tokenizer exceptions such as "theses" and "thisre".
parent 41389ffe1e
commit 727ce6d1f5
@@ -35,7 +35,7 @@ for pron in ["i"]:
 
         _exc[orth + "m"] = [
             {ORTH: orth, NORM: pron},
-            {ORTH: "m", "tenspect": 1, "number": 1},
+            {ORTH: "m"},
         ]
 
         _exc[orth + "'ma"] = [
@@ -139,26 +139,27 @@ for pron in ["he", "she", "it"]:
 
 # W-words, relative pronouns, prepositions etc.
 
-for word in [
-    "who",
-    "what",
-    "when",
-    "where",
-    "why",
-    "how",
-    "there",
-    "that",
-    "this",
-    "these",
-    "those",
+for word, morph in [
+    ("who", None),
+    ("what", None),
+    ("when", None),
+    ("where", None),
+    ("why", None),
+    ("how", None),
+    ("there", None),
+    ("that", "Number=Sing|Person=3"),
+    ("this", "Number=Sing|Person=3"),
+    ("these", "Number=Plur|Person=3"),
+    ("those", "Number=Plur|Person=3"),
 ]:
     for orth in [word, word.title()]:
-        _exc[orth + "'s"] = [
-            {ORTH: orth, NORM: word},
-            {ORTH: "'s", NORM: "'s"},
-        ]
+        if morph != "Number=Plur|Person=3":
+            _exc[orth + "'s"] = [
+                {ORTH: orth, NORM: word},
+                {ORTH: "'s", NORM: "'s"},
+            ]
 
-        _exc[orth + "s"] = [{ORTH: orth, NORM: word}, {ORTH: "s"}]
+            _exc[orth + "s"] = [{ORTH: orth, NORM: word}, {ORTH: "s"}]
 
         _exc[orth + "'ll"] = [
             {ORTH: orth, NORM: word},
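The per-word morph value drives two guards in this loop: suffixes that contract "is"/"has" ("'s", "s") are skipped for plural bases, and suffixes that contract "are"/"have" ("'re", "re", "'ve", "ve") are skipped for singular bases (that second guard appears in the next hunk). A self-contained sketch of the idea, using made-up names (pairs, exc) rather than spaCy's real _exc machinery:

pairs = [
    ("that", "Number=Sing|Person=3"),
    ("this", "Number=Sing|Person=3"),
    ("these", "Number=Plur|Person=3"),
    ("those", "Number=Plur|Person=3"),
]
exc = {}
for word, morph in pairs:
    if morph != "Number=Plur|Person=3":  # "s" ('s = is) needs a singular base
        exc[word + "s"] = [word, "s"]
    if morph != "Number=Sing|Person=3":  # "re" ('re = are) needs a plural base
        exc[word + "re"] = [word, "re"]

print("theses" in exc, "thisre" in exc)  # False False -- the bad forms are gone
print("thats" in exc, "thesere" in exc)  # True True -- consistent forms remain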
@@ -182,25 +183,26 @@ for word in [
             {ORTH: "ve", NORM: "have"},
         ]
 
-        _exc[orth + "'re"] = [
-            {ORTH: orth, NORM: word},
-            {ORTH: "'re", NORM: "are"},
-        ]
+        if morph != "Number=Sing|Person=3":
+            _exc[orth + "'re"] = [
+                {ORTH: orth, NORM: word},
+                {ORTH: "'re", NORM: "are"},
+            ]
 
-        _exc[orth + "re"] = [
-            {ORTH: orth, NORM: word},
-            {ORTH: "re", NORM: "are"},
-        ]
+            _exc[orth + "re"] = [
+                {ORTH: orth, NORM: word},
+                {ORTH: "re", NORM: "are"},
+            ]
 
-        _exc[orth + "'ve"] = [
-            {ORTH: orth, NORM: word},
-            {ORTH: "'ve"},
-        ]
+            _exc[orth + "'ve"] = [
+                {ORTH: orth, NORM: word},
+                {ORTH: "'ve"},
+            ]
 
-        _exc[orth + "ve"] = [
-            {ORTH: orth},
-            {ORTH: "ve", NORM: "have"},
-        ]
+            _exc[orth + "ve"] = [
+                {ORTH: orth},
+                {ORTH: "ve", NORM: "have"},
+            ]
 
         _exc[orth + "'d"] = [
             {ORTH: orth, NORM: word},
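With both guards in place, the mismatched surface forms are never added to _exc, while the number-consistent contractions survive. A hedged end-to-end check (assuming a spaCy build that includes this fix):

import spacy

nlp = spacy.blank("en")
for text in ["theses", "thisre", "these're", "that's"]:
    print(text, "->", [t.text for t in nlp(text)])
# Expected with the fix applied:
#   theses -> ['theses']            (no longer split into "these" + "s")
#   thisre -> ['thisre']            (no longer split into "this" + "re")
#   these're -> ['these', "'re"]    (still a valid contraction)
#   that's -> ['that', "'s"]        (still a valid contraction)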
@@ -167,3 +167,12 @@ def test_issue3521(en_tokenizer, word):
     tok = en_tokenizer(word)[1]
     # 'not' and 'would' should be stopwords, also in their abbreviated forms
     assert tok.is_stop
+
+
+@pytest.mark.issue(10699)
+@pytest.mark.parametrize("text", ["theses", "thisre"])
+def test_issue10699(en_tokenizer, text):
+    """Test that 'theses' and 'thisre' are excluded from the contractions
+    generated by the English tokenizer exceptions."""
+    tokens = en_tokenizer(text)
+    assert len(tokens) == 1
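The regression test uses spaCy's shared en_tokenizer fixture and the project's custom issue marker. Assuming a standard checkout, it can be selected by keyword (the test file's path is not shown in this diff, so the target directory below is a guess):

pytest -k issue10699 spacy/tests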