Mirror of https://github.com/explosion/spaCy.git (synced 2025-10-30 23:47:31 +03:00)
	Improve Italian tokenization (#5204)
Improve Italian tokenization for UD_Italian-ISDT.
This commit is contained in:
parent 923a453449
commit 1a944e5976
spacy/lang/it/__init__.py
@@ -4,7 +4,7 @@ from __future__ import unicode_literals
 from .stop_words import STOP_WORDS
 from .tag_map import TAG_MAP
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_INFIXES
+from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
@@ -22,6 +22,7 @@ class ItalianDefaults(Language.Defaults):
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
     tag_map = TAG_MAP
+    prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
 
 
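The only change needed in the language class is to expose the prefix list: spaCy's tokenizer factory reads the prefixes, suffixes and infixes attributes off Language.Defaults when the pipeline is created, so the added prefixes = TOKENIZER_PREFIXES line is enough to activate the new rules. A minimal sketch of that wiring, assuming spaCy v2.x is installed:

# Minimal sketch, assuming spaCy v2.x; illustrates how the Defaults are consumed.
from spacy.lang.it import Italian

# Instantiating the language builds its tokenizer from ItalianDefaults, so the
# two class attributes above are all the language class needs to change.
nlp = Italian()

print("'[0-9][0-9]" in Italian.Defaults.prefixes)   # True: new prefix rule registered
print("[0-9]+°" in Italian.Defaults.prefixes)       # True
print(nlp.tokenizer.prefix_search("'92"))           # a match object: the rule is live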
spacy/lang/it/punctuation.py
@@ -1,15 +1,39 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ..punctuation import TOKENIZER_INFIXES
-from ..char_classes import ALPHA
+from ..punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
+from ..char_classes import LIST_ELLIPSES, LIST_ICONS
+from ..char_classes import ALPHA, HYPHENS, CONCAT_QUOTES
+from ..char_classes import ALPHA_LOWER, ALPHA_UPPER
 
 
-ELISION = " ' ’ ".strip().replace(" ", "")
+ELISION = "'’"
 
 
-_infixes = TOKENIZER_INFIXES + [
-    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
-]
+_prefixes = (
+    [
+        r"'[0-9][0-9]",
+        r"[0-9]+°",
+
+    ]
+    + TOKENIZER_PREFIXES
+)
+
+
+_infixes = (
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
+        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
+        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
+            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
+        ),
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])(?:{h})(?=[{al}])".format(a=ALPHA, h=HYPHENS, al=ALPHA_LOWER),
+        r"(?<=[{a}0-9])[:<>=\/](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}][{el}])(?=[{a}0-9\"])".format(a=ALPHA, el=ELISION)
+    ]
+)
 
+TOKENIZER_PREFIXES = _prefixes
 TOKENIZER_INFIXES = _infixes
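The two new prefix patterns cover apostrophe-abbreviated years ('92) and numbers followed by the ordinal degree sign (1°, 2°); the rebuilt infix list mirrors the default rules (ellipses, icons, number operators, hyphens, etc.) and extends the elision rule so it also fires when a digit or straight quote follows the apostrophe. A small sketch showing the rules in isolation, using spaCy's stock compile_prefix_regex / compile_infix_regex helpers (assuming spaCy v2.x; the example strings are invented):

# Sketch, assuming spaCy v2.x: compile the Italian lists the same way the tokenizer does.
from spacy.util import compile_prefix_regex, compile_infix_regex
from spacy.lang.it.punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES

prefix_re = compile_prefix_regex(TOKENIZER_PREFIXES)
infix_re = compile_infix_regex(TOKENIZER_INFIXES)

print(prefix_re.search("'92").group())   # "'92": abbreviated year split off as a prefix
print(prefix_re.search("2°").group())    # "2°": ordinal with degree sign

# The elision infix is zero-width: it only marks the split point after "dell'".
print([(m.start(), m.end()) for m in infix_re.finditer("dell'arte")])  # roughly [(5, 5)]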
spacy/lang/it/tokenizer_exceptions.py
@@ -2,6 +2,56 @@
 from __future__ import unicode_literals
 from ...symbols import ORTH, LEMMA
 
-_exc = {"po'": [{ORTH: "po'", LEMMA: "poco"}]}
+_exc = {
+    "all'art.": [{ORTH: "all'"}, {ORTH: "art."}],
+    "dall'art.": [{ORTH: "dall'"}, {ORTH: "art."}],
+    "dell'art.": [{ORTH: "dell'"}, {ORTH: "art."}],
+    "L'art.": [{ORTH: "L'"}, {ORTH: "art."}],
+    "l'art.": [{ORTH: "l'"}, {ORTH: "art."}],
+    "nell'art.": [{ORTH: "nell'"}, {ORTH: "art."}],
+    "po'": [{ORTH: "po'", LEMMA: "poco"}],
+    "sett..": [{ORTH: "sett."}, {ORTH: "."}]
+}
+
+for orth in [
+    "..",
+    "....",
+    "al.",
+    "all-path",
+    "art.",
+    "Art.",
+    "artt.",
+    "att.",
+    "by-pass",
+    "c.d.",
+    "centro-sinistra",
+    "check-up",
+    "Civ.",
+    "cm.",
+    "Cod.",
+    "col.",
+    "Cost.",
+    "d.C.",
+    'de"',
+    "distr.",
+    "E'",
+    "ecc.",
+    "e-mail",
+    "e/o",
+    "etc.",
+    "Jr.",
+    "n°",
+    "nord-est",
+    "pag.",
+    "Proc.",
+    "prof.",
+    "sett.",
+    "s.p.a.",
+    "ss.",
+    "St.",
+    "tel.",
+    "week-end",
+]:
+    _exc[orth] = [{ORTH: orth}]
 
 TOKENIZER_EXCEPTIONS = _exc
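Tokenizer exceptions are exact-match rules checked before the general prefix/suffix/infix splitting: multi-token entries such as "dall'art." are split exactly as listed ("dall'" + "art."), while the single-token entries added in the loop keep abbreviations and hyphenated words like "prof.", "ecc." or "e-mail" intact. A short sketch of the effect on a made-up sentence, assuming spaCy v2.x:

# Sketch, assuming spaCy v2.x; the sentence is an invented example.
import spacy

nlp = spacy.blank("it")
doc = nlp("Come previsto dall'art. 3, inviare una e-mail al prof. Rossi ecc.")
print([t.text for t in doc])
# Roughly expected: "dall'art." becomes "dall'" + "art." via its special case,
# while "e-mail", "prof." and "ecc." each stay a single token.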