spaCy/spacy/es/language_data.py

# encoding: utf8
from __future__ import unicode_literals

from .. import language_data as base
from ..language_data import update_exc, strings_to_exc
from ..symbols import ORTH, LEMMA

from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY


def get_time_exc(hours):
    """Build tokenizer exceptions for clock times written without a space.

    For every hour in ``hours`` four entries are produced, keyed
    ``"<h>a.m."``, ``"<h>p.m."``, ``"<h>am"`` and ``"<h>pm"``. Each entry
    splits the string into an hour token and a period token; the undotted
    forms ``am``/``pm`` additionally carry the dotted form as LEMMA. A fixed
    entry for ``"12m."`` is always included, regardless of ``hours``.

    hours: iterable of integers (the keys are built with ``%d``).
    Returns a dict mapping each exception string to its list of token
    attribute dicts.
    """
    exc = {
        "12m.": [
            {ORTH: "12"},
            {ORTH: "m.", LEMMA: "p.m."}
        ]
    }

    for hour in hours:
        # ORTH has to be text, not the raw int: the pieces of an exception
        # must concatenate back to the key they are registered under.
        hour_orth = "%d" % hour

        exc["%da.m." % hour] = [
            {ORTH: hour_orth},
            {ORTH: "a.m."}
        ]

        exc["%dp.m." % hour] = [
            {ORTH: hour_orth},
            {ORTH: "p.m."}
        ]

        exc["%dam" % hour] = [
            {ORTH: hour_orth},
            {ORTH: "am", LEMMA: "a.m."}
        ]

        exc["%dpm" % hour] = [
            {ORTH: hour_orth},
            {ORTH: "pm", LEMMA: "p.m."}
        ]
    return exc


# Rebind both names to fresh containers built from the imported data, so the
# merging below is applied to this module's own dict.
STOP_WORDS = set(STOP_WORDS)
TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)

# Extra exception sources, merged in this order: orth-only strings, clock
# times for hours 1 through 12, then the shared emoticons and abbreviations.
for _extra_exc in (
    strings_to_exc(ORTH_ONLY),
    get_time_exc(range(1, 13)),
    strings_to_exc(base.EMOTICONS),
    strings_to_exc(base.ABBREVIATIONS),
):
    update_exc(TOKENIZER_EXCEPTIONS, _extra_exc)
del _extra_exc  # keep the loop variable out of the module namespace


__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
Stub out support for French, Spanish, Italian and Portuguese 2016-11-02 22:02:41 +03:00			`# encoding: utf8`
			`from __future__ import unicode_literals`
Fix infixes in spanish and portuguese 2016-11-02 22:43:12 +03:00
Reorganise language data 2016-12-18 18:54:19 +03:00			`from .. import language_data as base`
			`from ..language_data import update_exc, strings_to_exc`
Add tokenizer exceptions for a.m. and p.m. in Spanish 2016-12-21 20:19:10 +03:00			`from ..symbols import ORTH, LEMMA`
Stub out support for French, Spanish, Italian and Portuguese 2016-11-02 22:02:41 +03:00
Reorganise language data 2016-12-18 18:54:19 +03:00			`from .stop_words import STOP_WORDS`
			`from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY`
Stub out support for French, Spanish, Italian and Portuguese 2016-11-02 22:02:41 +03:00
Add Spanish language data 2016-12-08 21:47:03 +03:00
Add tokenizer exceptions for a.m. and p.m. in Spanish 2016-12-21 20:19:10 +03:00			`def get_time_exc(hours):`
			`exc = {`
			`"12m.": [`
			`{ORTH: "12"},`
			`{ORTH: "m.", LEMMA: "p.m."}`
			`]`
			`}`

			`for hour in hours:`
			`exc["%da.m." % hour] = [`
			`{ORTH: hour},`
			`{ORTH: "a.m."}`
			`]`

			`exc["%dp.m." % hour] = [`
			`{ORTH: hour},`
			`{ORTH: "p.m."}`
			`]`

			`exc["%dam" % hour] = [`
			`{ORTH: hour},`
			`{ORTH: "am", LEMMA: "a.m."}`
			`]`

			`exc["%dpm" % hour] = [`
			`{ORTH: hour},`
			`{ORTH: "pm", LEMMA: "p.m."}`
			`]`
			`return exc`


Reorganise language data 2016-12-18 18:54:19 +03:00			`STOP_WORDS = set(STOP_WORDS)`
Add Spanish language data 2016-12-08 21:47:03 +03:00
Use global abbreviation data languages and remove duplicates 2017-01-08 22:36:00 +03:00
			`TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)`
Reorganise language data 2016-12-18 18:54:19 +03:00			`update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))`
Fix formatting and consistency 2016-12-23 23:35:11 +03:00			`update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1)))`
Reorganise language data 2016-12-18 18:54:19 +03:00			`update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))`
Use global abbreviation data languages and remove duplicates 2017-01-08 22:36:00 +03:00			`update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))`

Add Spanish language data 2016-12-08 21:47:03 +03:00
Reorganise language data 2016-12-18 18:54:19 +03:00			`__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]`