mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 16:07:41 +03:00 
			
		
		
		
	Swedish: Exceptions for single letter words ending sentence (#2615)
* Exceptions for single-letter words ending a sentence: sentences ending in "i." (as in "... peka i.") or "m." (as in "... än 2000 m.") should be tokenized as two separate tokens. * Add test
This commit is contained in:
		
							parent
							
								
									860f5bd91f
								
							
						
					
					
						commit
						1914c488d3
					
				|  | @ -1,7 +1,7 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA | ||||
| from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA, PUNCT | ||||
| 
 | ||||
| 
 | ||||
| _exc = {} | ||||
|  | @ -78,5 +78,11 @@ for orth in [ | |||
|     "s.k.", "st.", "s:t", "t.ex.", "t.o.m.", "ung.", "äv.", "övers."]: | ||||
|     _exc[orth] = [{ORTH: orth}] | ||||
| 
 | ||||
| # Sentences ending in "i." (as in "... peka i.") or "m." (as in "... än 2000 m.") | ||||
| # should be tokenized as two separate tokens: the letter and the final period. | ||||
| for orth in ["i", "m"]: | ||||
|     _exc[orth + "."] = [ | ||||
|         {ORTH: orth, LEMMA: orth, NORM: orth}, | ||||
|         {ORTH: ".", TAG: PUNCT}] | ||||
| 
 | ||||
| TOKENIZER_EXCEPTIONS = _exc | ||||
|  |  | |||
|  | @ -6,7 +6,8 @@ import pytest | |||
| 
 | ||||
| SV_TOKEN_EXCEPTION_TESTS = [ | ||||
|     ('Smörsåsen används bl.a. till fisk', ['Smörsåsen', 'används', 'bl.a.', 'till', 'fisk']), | ||||
|     ('Jag kommer först kl. 13 p.g.a. diverse förseningar', ['Jag', 'kommer', 'först', 'kl.', '13', 'p.g.a.', 'diverse', 'förseningar']) | ||||
|     ('Jag kommer först kl. 13 p.g.a. diverse förseningar', ['Jag', 'kommer', 'först', 'kl.', '13', 'p.g.a.', 'diverse', 'förseningar']), | ||||
|     ('Anders I. tycker om ord med i i.', ["Anders", "I.", "tycker", "om", "ord", "med", "i", "i", "."]) | ||||
| ] | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user