mirror of https://github.com/explosion/spaCy.git (synced 2025-10-30 23:47:31 +03:00)

Reorganise Spanish language data

commit 8e483ec950
parent c7c21b980f
spacy/es/__init__.py
@@ -1,14 +1,17 @@
 # coding: utf8
-from __future__ import unicode_literals, print_function
+from __future__ import unicode_literals
 
-from os import path
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from .tag_map import TAG_MAP
+from .stop_words import STOP_WORDS
+from .lemmatizer import LOOKUP
 
+from ..language_data import BASE_EXCEPTIONS
 from ..language import Language
-from ..attrs import LANG
-
-from .language_data import *
 from ..lemmatizerlookup import Lemmatizer
-from .lemmatization import LOOK_UP
+from ..attrs import LANG
+from ..util import update_exc
 
 
 class Spanish(Language):
     lang = 'es'
@@ -17,10 +20,13 @@ class Spanish(Language):
         lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
         lex_attr_getters[LANG] = lambda text: 'es'
 
-        tokenizer_exceptions = TOKENIZER_EXCEPTIONS
-        tag_map = TAG_MAP
-        stop_words = STOP_WORDS
+        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+        tag_map = dict(TAG_MAP)
+        stop_words = set(STOP_WORDS)
+
+        @classmethod
+        def create_lemmatizer(cls, nlp=None):
+            return Lemmatizer(LOOKUP)
 
 
-
-EXPORT = Spanish
+__all__ = ['Spanish']
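For orientation, here is a rough, self-contained sketch of what the new Defaults wiring does: the language-specific exception table is layered over a shared base table, and the lemmatizer is built from the lookup dict. The names BASE, ES_EXC and merge_exceptions below are illustrative stand-ins, not spaCy's BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS or update_exc.

# Illustrative sketch only -- plain dicts standing in for spaCy's data.
BASE = {"etc.": [{"ORTH": "etc."}]}                    # stand-in for the shared base table
ES_EXC = {"del": [{"ORTH": "de"}, {"ORTH": "l"}]}      # stand-in for the Spanish table

def merge_exceptions(base, *addons):
    # Mirrors the role update_exc() plays above: copy the shared base
    # table and layer the language-specific entries on top of it.
    merged = dict(base)
    for addon in addons:
        merged.update(addon)
    return merged

exceptions = merge_exceptions(BASE, ES_EXC)
assert "etc." in exceptions and "del" in exceptions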
|  | @ -1,7 +1,8 @@ | ||||||
| # coding: utf8 | # coding: utf8 | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| LOOK_UP = { | 
 | ||||||
|  | LOOKUP = { | ||||||
|     "aba": "abar", |     "aba": "abar", | ||||||
|     "ababa": "abar", |     "ababa": "abar", | ||||||
|     "ababais": "abar", |     "ababais": "abar", | ||||||
|  | @ -1,8 +1,6 @@ | ||||||
| # coding: utf8 | # coding: utf8 | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| from ..symbols import * |  | ||||||
| 
 |  | ||||||
| 
 | 
 | ||||||
| TAG_MAP = { | TAG_MAP = { | ||||||
|     "ADJ___": {"morph": "_", "pos": "ADJ"}, |     "ADJ___": {"morph": "_", "pos": "ADJ"}, | ||||||
|  |  | ||||||
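The table itself is unchanged apart from dropping the wildcard import: each fine-grained tag string still maps to a coarse POS plus a morphology string. A tiny sketch of reading an entry, using only the "ADJ___" line visible in the diff (the coarse_pos helper is hypothetical):

# Toy excerpt of the map, using the entry shown in the diff above.
TAG_MAP = {
    "ADJ___": {"morph": "_", "pos": "ADJ"},
}

def coarse_pos(tag):
    # Hypothetical helper: resolve a fine-grained tag to its universal POS.
    return TAG_MAP[tag]["pos"]

assert coarse_pos("ADJ___") == "ADJ"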
|  | @ -1,113 +1,82 @@ | ||||||
| # coding: utf8 | # coding: utf8 | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| from ..symbols import * | from ..symbols import ORTH, LEMMA, TAG, NORM, ADP, DET | ||||||
| from ..language_data import PRON_LEMMA, DET_LEMMA | from ..deprecated import PRON_LEMMA, DET_LEMMA | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| TOKENIZER_EXCEPTIONS = { | _exc = { | ||||||
|     "al": [ |     "al": [ | ||||||
|         {ORTH: "a", LEMMA: "a", TAG: ADP}, |         {ORTH: "a", LEMMA: "a", TAG: ADP}, | ||||||
|         {ORTH: "el", LEMMA: "el", TAG: DET} |         {ORTH: "l", LEMMA: "el", TAG: DET}], | ||||||
|     ], |  | ||||||
| 
 | 
 | ||||||
|     "consigo": [ |     "consigo": [ | ||||||
|         {ORTH: "con", LEMMA: "con"}, |         {ORTH: "con", LEMMA: "con"}, | ||||||
|         {ORTH: "sigo", LEMMA: PRON_LEMMA, NORM: "sí"} |         {ORTH: "sigo", LEMMA: PRON_LEMMA, NORM: "sí"}], | ||||||
|     ], |  | ||||||
| 
 | 
 | ||||||
|     "conmigo": [ |     "conmigo": [ | ||||||
|         {ORTH: "con", LEMMA: "con"}, |         {ORTH: "con", LEMMA: "con"}, | ||||||
|         {ORTH: "migo", LEMMA: PRON_LEMMA, NORM: "mí"} |         {ORTH: "migo", LEMMA: PRON_LEMMA, NORM: "mí"}], | ||||||
|     ], |  | ||||||
| 
 | 
 | ||||||
|     "contigo": [ |     "contigo": [ | ||||||
|         {ORTH: "con", LEMMA: "con"}, |         {ORTH: "con", LEMMA: "con"}, | ||||||
|         {ORTH: "tigo", LEMMA: PRON_LEMMA, NORM: "ti"} |         {ORTH: "tigo", LEMMA: PRON_LEMMA, NORM: "ti"}], | ||||||
|     ], |  | ||||||
| 
 | 
 | ||||||
|     "del": [ |     "del": [ | ||||||
|         {ORTH: "de", LEMMA: "de", TAG: ADP}, |         {ORTH: "de", LEMMA: "de", TAG: ADP}, | ||||||
|         {ORTH: "l", LEMMA: "el", TAG: DET} |         {ORTH: "l", LEMMA: "el", TAG: DET}], | ||||||
|     ], |  | ||||||
| 
 | 
 | ||||||
|     "pel": [ |     "pel": [ | ||||||
|         {ORTH: "pe", LEMMA: "per", TAG: ADP}, |         {ORTH: "pe", LEMMA: "per", TAG: ADP}, | ||||||
|         {ORTH: "l", LEMMA: "el", TAG: DET} |         {ORTH: "l", LEMMA: "el", TAG: DET}], | ||||||
|     ], |  | ||||||
| 
 | 
 | ||||||
|     "pal": [ |     "pal": [ | ||||||
|         {ORTH: "pa", LEMMA: "para"}, |         {ORTH: "pa", LEMMA: "para"}, | ||||||
|         {ORTH: "l", LEMMA: DET_LEMMA, NORM: "el"} |         {ORTH: "l", LEMMA: DET_LEMMA, NORM: "el"}], | ||||||
|     ], |  | ||||||
| 
 | 
 | ||||||
|     "pala": [ |     "pala": [ | ||||||
|         {ORTH: "pa", LEMMA: "para"}, |         {ORTH: "pa", LEMMA: "para"}, | ||||||
|         {ORTH: "la", LEMMA: DET_LEMMA} |         {ORTH: "la", LEMMA: DET_LEMMA}] | ||||||
|     ], |  | ||||||
| 
 |  | ||||||
|     "aprox.": [ |  | ||||||
|         {ORTH: "aprox.", LEMMA: "aproximadamente"} |  | ||||||
|     ], |  | ||||||
| 
 |  | ||||||
|     "dna.": [ |  | ||||||
|         {ORTH: "dna.", LEMMA: "docena"} |  | ||||||
|     ], |  | ||||||
| 
 |  | ||||||
|     "esq.": [ |  | ||||||
|         {ORTH: "esq.", LEMMA: "esquina"} |  | ||||||
|     ], |  | ||||||
| 
 |  | ||||||
|     "pág.": [ |  | ||||||
|         {ORTH: "pág.", LEMMA: "página"} |  | ||||||
|     ], |  | ||||||
| 
 |  | ||||||
|     "p.ej.": [ |  | ||||||
|         {ORTH: "p.ej.", LEMMA: "por ejemplo"} |  | ||||||
|     ], |  | ||||||
| 
 |  | ||||||
|     "Ud.": [ |  | ||||||
|         {ORTH: "Ud.", LEMMA: PRON_LEMMA, NORM: "usted"} |  | ||||||
|     ], |  | ||||||
| 
 |  | ||||||
|     "Vd.": [ |  | ||||||
|         {ORTH: "Vd.", LEMMA: PRON_LEMMA, NORM: "usted"} |  | ||||||
|     ], |  | ||||||
| 
 |  | ||||||
|     "Uds.": [ |  | ||||||
|         {ORTH: "Uds.", LEMMA: PRON_LEMMA, NORM: "ustedes"} |  | ||||||
|     ], |  | ||||||
| 
 |  | ||||||
|     "Vds.": [ |  | ||||||
|         {ORTH: "Vds.", LEMMA: PRON_LEMMA, NORM: "ustedes"} |  | ||||||
|     ] |  | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| ORTH_ONLY = [ | for exc_data in [ | ||||||
|     "a.C.", |     {ORTH: "aprox.", LEMMA: "aproximadamente"}, | ||||||
|     "a.J.C.", |     {ORTH: "dna.", LEMMA: "docena"}, | ||||||
|     "apdo.", |     {ORTH: "esq.", LEMMA: "esquina"}, | ||||||
|     "Av.", |     {ORTH: "pág.", LEMMA: "página"}, | ||||||
|     "Avda.", |     {ORTH: "p.ej.", LEMMA: "por ejemplo"}, | ||||||
|     "Cía.", |     {ORTH: "Ud.", LEMMA: PRON_LEMMA, NORM: "usted"}, | ||||||
|     "etc.", |     {ORTH: "Vd.", LEMMA: PRON_LEMMA, NORM: "usted"}, | ||||||
|     "Gob.", |     {ORTH: "Uds.", LEMMA: PRON_LEMMA, NORM: "ustedes"}, | ||||||
|     "Gral.", |     {ORTH: "Vds.", LEMMA: PRON_LEMMA, NORM: "ustedes"}]: | ||||||
|     "Ing.", |     _exc[exc_data[ORTH]] = [dict(exc_data)] | ||||||
|     "J.C.", | 
 | ||||||
|     "Lic.", | 
 | ||||||
|     "m.n.", | # Times | ||||||
|     "no.", | 
 | ||||||
|     "núm.", | _exc["12m."] = [ | ||||||
|     "P.D.", |     {ORTH: "12"}, | ||||||
|     "Prof.", |     {ORTH: "m.", LEMMA: "p.m."}] | ||||||
|     "Profa.", | 
 | ||||||
|     "q.e.p.d." | 
 | ||||||
|     "S.A.", | for h in range(1, 12 + 1): | ||||||
|     "S.L.", |     hour = str(h) | ||||||
|     "s.s.s.", |     for period in ["a.m.", "am"]: | ||||||
|     "Sr.", |         _exc[hour+period] = [ | ||||||
|     "Sra.", |             {ORTH: hour}, | ||||||
|     "Srta." |             {ORTH: period, LEMMA: "a.m."}] | ||||||
| ] |     for period in ["p.m.", "pm"]: | ||||||
|  |         _exc[hour+period] = [ | ||||||
|  |             {ORTH: hour}, | ||||||
|  |             {ORTH: period, LEMMA: "p.m."}] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | for orth in [ | ||||||
|  |     "a.C.", "a.J.C.", "apdo.", "Av.", "Avda.", "Cía.", "etc.", "Gob.", "Gral.", | ||||||
|  |     "Ing.", "J.C.", "Lic.", "m.n.", "no.", "núm.", "P.D.", "Prof.", "Profa.", | ||||||
|  |     "q.e.p.d.", "S.A.", "S.L.", "s.s.s.", "Sr.", "Sra.", "Srta."]: | ||||||
|  |     _exc[orth] = [{ORTH: orth}] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | TOKENIZER_EXCEPTIONS = dict(_exc) | ||||||
|  |  | ||||||
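To make the generated entries concrete, the snippet below re-runs the time-exception loop from the new module on its own, with plain strings standing in for the ORTH and LEMMA symbol constants (those constants are spaCy-internal; the strings are only there so the sketch is self-contained and runnable):

# Standalone re-run of the time-exception loop, string keys instead of symbols.
ORTH, LEMMA = "orth", "lemma"

_exc = {}
_exc["12m."] = [
    {ORTH: "12"},
    {ORTH: "m.", LEMMA: "p.m."}]

for h in range(1, 12 + 1):
    hour = str(h)
    for period in ["a.m.", "am"]:
        _exc[hour + period] = [
            {ORTH: hour},
            {ORTH: period, LEMMA: "a.m."}]
    for period in ["p.m.", "pm"]:
        _exc[hour + period] = [
            {ORTH: hour},
            {ORTH: period, LEMMA: "p.m."}]

print(_exc["3pm"])  # [{'orth': '3'}, {'orth': 'pm', 'lemma': 'p.m.'}]
print(len(_exc))    # 49 entries: 12 hours x 4 spellings, plus "12m."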