Mirror of https://github.com/explosion/spaCy.git

Reorganise Spanish language data

commit 8e483ec950
parent c7c21b980f
@@ -1,14 +1,17 @@
# coding: utf8
from __future__ import unicode_literals, print_function
from __future__ import unicode_literals

from os import path
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .lemmatizer import LOOKUP

from ..language_data import BASE_EXCEPTIONS
from ..language import Language
from ..attrs import LANG

from .language_data import *
from ..lemmatizerlookup import Lemmatizer
from .lemmatization import LOOK_UP
from ..attrs import LANG
from ..util import update_exc


class Spanish(Language):
    lang = 'es'
@@ -17,10 +20,13 @@ class Spanish(Language):
        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
        lex_attr_getters[LANG] = lambda text: 'es'

        tokenizer_exceptions = TOKENIZER_EXCEPTIONS
        tag_map = TAG_MAP
        stop_words = STOP_WORDS
        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
        tag_map = dict(TAG_MAP)
        stop_words = set(STOP_WORDS)

        @classmethod
        def create_lemmatizer(cls, nlp=None):
            return Lemmatizer(LOOKUP)



EXPORT = Spanish
__all__ = ['Spanish']
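In the hunks above, the Spanish package stops star-importing its language data and instead imports each resource explicitly, merging the shared base exceptions into the Spanish-specific ones via update_exc. The following standalone sketch shows that merge pattern only; it is illustrative and is not spaCy's actual spacy.util.update_exc implementation, and the example entries are assumptions.

# Standalone sketch of update_exc-style merging (illustrative only): later
# dicts override earlier ones, so language-specific entries win over the base.
def merge_exceptions(base, *additions):
    merged = dict(base)
    for addition in additions:
        merged.update(addition)
    return merged

BASE_EXCEPTIONS = {"etc.": [{"ORTH": "etc."}]}
SPANISH_EXCEPTIONS = {"pág.": [{"ORTH": "pág.", "LEMMA": "página"}]}
TOKENIZER_EXCEPTIONS = merge_exceptions(BASE_EXCEPTIONS, SPANISH_EXCEPTIONS)
print(sorted(TOKENIZER_EXCEPTIONS))   # ['etc.', 'pág.']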
@@ -1,7 +1,8 @@
# coding: utf8
from __future__ import unicode_literals

LOOK_UP = {

LOOKUP = {
    "aba": "abar",
    "ababa": "abar",
    "ababais": "abar",
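The lemmatization table is renamed from LOOK_UP to LOOKUP; it maps inflected forms directly to lemmas. Below is a minimal sketch of how such a lookup table can be applied, as a simplified stand-in for spaCy's lookup lemmatizer rather than its real API.

# Minimal lookup-lemmatization sketch using entries from the table above;
# unknown forms fall back to the surface form unchanged.
LOOKUP = {"aba": "abar", "ababa": "abar", "ababais": "abar"}

def lemmatize(word, lookup=LOOKUP):
    return lookup.get(word, word)

assert lemmatize("ababa") == "abar"
assert lemmatize("gatos") == "gatos"   # not in the table, returned as-is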
@@ -1,8 +1,6 @@
# coding: utf8
from __future__ import unicode_literals

from ..symbols import *


TAG_MAP = {
    "ADJ___": {"morph": "_", "pos": "ADJ"},
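The tag map hunk above drops the now-unused star import; each entry maps a fine-grained tag to a coarse universal POS plus a morphology string. A small illustrative sketch of how such a mapping is typically consulted follows; the fallback value is an assumption, not spaCy behaviour.

# Illustrative use of a TAG_MAP-style dict: look up the coarse POS for a
# fine-grained tag, falling back to "X" (assumed default) when missing.
TAG_MAP = {
    "ADJ___": {"morph": "_", "pos": "ADJ"},
}

def coarse_pos(tag, tag_map=TAG_MAP):
    return tag_map.get(tag, {}).get("pos", "X")

print(coarse_pos("ADJ___"))   # ADJ
print(coarse_pos("UNKNOWN"))  # X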
@@ -1,113 +1,82 @@
# coding: utf8
from __future__ import unicode_literals

from ..symbols import *
from ..language_data import PRON_LEMMA, DET_LEMMA
from ..symbols import ORTH, LEMMA, TAG, NORM, ADP, DET
from ..deprecated import PRON_LEMMA, DET_LEMMA


TOKENIZER_EXCEPTIONS = {
_exc = {
|     "al": [ | ||||
|         {ORTH: "a", LEMMA: "a", TAG: ADP}, | ||||
|         {ORTH: "el", LEMMA: "el", TAG: DET} | ||||
|     ], | ||||
|         {ORTH: "l", LEMMA: "el", TAG: DET}], | ||||
| 
 | ||||
|     "consigo": [ | ||||
|         {ORTH: "con", LEMMA: "con"}, | ||||
|         {ORTH: "sigo", LEMMA: PRON_LEMMA, NORM: "sí"} | ||||
|     ], | ||||
|         {ORTH: "sigo", LEMMA: PRON_LEMMA, NORM: "sí"}], | ||||
| 
 | ||||
|     "conmigo": [ | ||||
|         {ORTH: "con", LEMMA: "con"}, | ||||
|         {ORTH: "migo", LEMMA: PRON_LEMMA, NORM: "mí"} | ||||
|     ], | ||||
|         {ORTH: "migo", LEMMA: PRON_LEMMA, NORM: "mí"}], | ||||
| 
 | ||||
|     "contigo": [ | ||||
|         {ORTH: "con", LEMMA: "con"}, | ||||
|         {ORTH: "tigo", LEMMA: PRON_LEMMA, NORM: "ti"} | ||||
|     ], | ||||
|         {ORTH: "tigo", LEMMA: PRON_LEMMA, NORM: "ti"}], | ||||
| 
 | ||||
|     "del": [ | ||||
|         {ORTH: "de", LEMMA: "de", TAG: ADP}, | ||||
|         {ORTH: "l", LEMMA: "el", TAG: DET} | ||||
|     ], | ||||
|         {ORTH: "l", LEMMA: "el", TAG: DET}], | ||||
| 
 | ||||
|     "pel": [ | ||||
|         {ORTH: "pe", LEMMA: "per", TAG: ADP}, | ||||
|         {ORTH: "l", LEMMA: "el", TAG: DET} | ||||
|     ], | ||||
|         {ORTH: "l", LEMMA: "el", TAG: DET}], | ||||
| 
 | ||||
|     "pal": [ | ||||
|         {ORTH: "pa", LEMMA: "para"}, | ||||
|         {ORTH: "l", LEMMA: DET_LEMMA, NORM: "el"} | ||||
|     ], | ||||
|         {ORTH: "l", LEMMA: DET_LEMMA, NORM: "el"}], | ||||
| 
 | ||||
|     "pala": [ | ||||
|         {ORTH: "pa", LEMMA: "para"}, | ||||
|         {ORTH: "la", LEMMA: DET_LEMMA} | ||||
|     ], | ||||
| 
 | ||||
|     "aprox.": [ | ||||
|         {ORTH: "aprox.", LEMMA: "aproximadamente"} | ||||
|     ], | ||||
| 
 | ||||
|     "dna.": [ | ||||
|         {ORTH: "dna.", LEMMA: "docena"} | ||||
|     ], | ||||
| 
 | ||||
|     "esq.": [ | ||||
|         {ORTH: "esq.", LEMMA: "esquina"} | ||||
|     ], | ||||
| 
 | ||||
|     "pág.": [ | ||||
|         {ORTH: "pág.", LEMMA: "página"} | ||||
|     ], | ||||
| 
 | ||||
|     "p.ej.": [ | ||||
|         {ORTH: "p.ej.", LEMMA: "por ejemplo"} | ||||
|     ], | ||||
| 
 | ||||
|     "Ud.": [ | ||||
|         {ORTH: "Ud.", LEMMA: PRON_LEMMA, NORM: "usted"} | ||||
|     ], | ||||
| 
 | ||||
|     "Vd.": [ | ||||
|         {ORTH: "Vd.", LEMMA: PRON_LEMMA, NORM: "usted"} | ||||
|     ], | ||||
| 
 | ||||
|     "Uds.": [ | ||||
|         {ORTH: "Uds.", LEMMA: PRON_LEMMA, NORM: "ustedes"} | ||||
|     ], | ||||
| 
 | ||||
|     "Vds.": [ | ||||
|         {ORTH: "Vds.", LEMMA: PRON_LEMMA, NORM: "ustedes"} | ||||
|     ] | ||||
|         {ORTH: "la", LEMMA: DET_LEMMA}] | ||||
| } | ||||


ORTH_ONLY = [
    "a.C.",
    "a.J.C.",
    "apdo.",
    "Av.",
    "Avda.",
    "Cía.",
    "etc.",
    "Gob.",
    "Gral.",
    "Ing.",
    "J.C.",
    "Lic.",
    "m.n.",
    "no.",
    "núm.",
    "P.D.",
    "Prof.",
    "Profa.",
    "q.e.p.d."
    "S.A.",
    "S.L.",
    "s.s.s.",
    "Sr.",
    "Sra.",
    "Srta."
]
for exc_data in [
    {ORTH: "aprox.", LEMMA: "aproximadamente"},
    {ORTH: "dna.", LEMMA: "docena"},
    {ORTH: "esq.", LEMMA: "esquina"},
    {ORTH: "pág.", LEMMA: "página"},
    {ORTH: "p.ej.", LEMMA: "por ejemplo"},
    {ORTH: "Ud.", LEMMA: PRON_LEMMA, NORM: "usted"},
    {ORTH: "Vd.", LEMMA: PRON_LEMMA, NORM: "usted"},
    {ORTH: "Uds.", LEMMA: PRON_LEMMA, NORM: "ustedes"},
    {ORTH: "Vds.", LEMMA: PRON_LEMMA, NORM: "ustedes"}]:
    _exc[exc_data[ORTH]] = [dict(exc_data)]


# Times

_exc["12m."] = [
    {ORTH: "12"},
    {ORTH: "m.", LEMMA: "p.m."}]


for h in range(1, 12 + 1):
    hour = str(h)
    for period in ["a.m.", "am"]:
        _exc[hour+period] = [
            {ORTH: hour},
            {ORTH: period, LEMMA: "a.m."}]
    for period in ["p.m.", "pm"]:
        _exc[hour+period] = [
            {ORTH: hour},
            {ORTH: period, LEMMA: "p.m."}]


for orth in [
    "a.C.", "a.J.C.", "apdo.", "Av.", "Avda.", "Cía.", "etc.", "Gob.", "Gral.",
    "Ing.", "J.C.", "Lic.", "m.n.", "no.", "núm.", "P.D.", "Prof.", "Profa.",
    "q.e.p.d.", "S.A.", "S.L.", "s.s.s.", "Sr.", "Sra.", "Srta."]:
    _exc[orth] = [{ORTH: orth}]


TOKENIZER_EXCEPTIONS = dict(_exc)
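The rewritten exceptions module builds the table programmatically: abbreviations with lemmas, time expressions like "3pm", and plain orth-only abbreviations are all added to _exc in loops before the final TOKENIZER_EXCEPTIONS = dict(_exc). The standalone sketch below reproduces what the hour loop generates, with plain string keys standing in for the ORTH/LEMMA symbol constants used in the real module.

# Reproduces the shape of the hour/period loop above with plain dict keys.
_exc = {}
for h in range(1, 12 + 1):
    hour = str(h)
    for period, lemma in [("a.m.", "a.m."), ("am", "a.m."),
                          ("p.m.", "p.m."), ("pm", "p.m.")]:
        _exc[hour + period] = [{"ORTH": hour}, {"ORTH": period, "LEMMA": lemma}]

print(_exc["3pm"])   # [{'ORTH': '3'}, {'ORTH': 'pm', 'LEMMA': 'p.m.'}]
print(len(_exc))     # 48 entries: 12 hours x 4 period spellings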