spaCy/spacy/es/language_data.py

# coding: utf8
from __future__ import unicode_literals

from .. import language_data as base
from ..language_data import update_exc, strings_to_exc
from ..symbols import ORTH, LEMMA

from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY


def get_time_exc(hours):
    """Build tokenizer exceptions for hours written flush against a.m./p.m.

    For every entry of ``hours`` four exception keys are produced --
    ``<hour>a.m.``, ``<hour>p.m.``, ``<hour>am`` and ``<hour>pm`` -- each
    mapped to a list of two token dicts: the hour itself and the suffix.
    The undotted suffixes ``am``/``pm`` additionally carry the dotted form
    as their LEMMA. A fixed ``12m.`` entry (``12`` followed by ``m.`` with
    lemma ``p.m.``) is always included, whatever ``hours`` contains.

    hours: Iterable of hour values; each one is interpolated into the key
        with ``%s`` and stored unchanged as the ORTH of the first token.
    RETURNS (dict): Exception string -> list of two token-attribute dicts.
    """
    # (key template, suffix as written, lemma -- None when no lemma is set)
    suffix_specs = (
        ("%sa.m.", "a.m.", None),
        ("%sp.m.", "p.m.", None),
        ("%sam", "am", "a.m."),
        ("%spm", "pm", "p.m."),
    )

    # The "12m." entry does not depend on the requested hours.
    time_exc = {"12m.": [{ORTH: "12"}, {ORTH: "m.", LEMMA: "p.m."}]}

    for hour_text in hours:
        for key_template, suffix, lemma in suffix_specs:
            suffix_token = {ORTH: suffix}
            if lemma is not None:
                suffix_token[LEMMA] = lemma
            time_exc[key_template % hour_text] = [
                {ORTH: hour_text},
                suffix_token,
            ]
    return time_exc


# Rebind STOP_WORDS to a new set built from the imported collection, so this
# module exposes its own set object.
STOP_WORDS = set(STOP_WORDS)


# Start from a shallow copy of the imported exceptions; the update_exc calls
# below then receive this module's dict rather than the imported object.
TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
# NOTE(review): update_exc / strings_to_exc are defined in ..language_data and
# not visible here -- presumably strings_to_exc turns plain strings into
# exception entries and update_exc merges them into the first argument;
# confirm against that module. The call order is kept as-is in case it matters.
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
# Time exceptions for the hour strings "1" through "12" (see get_time_exc).
update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(
    ['%d' % hour for hour in range(1, 12 + 1)]))
# Shared (language-independent) emoticon and abbreviation strings from the
# base language data.
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))


# Names exported by `from <this module> import *`.
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
Use consistent unicode declarations 2017-03-12 15:07:28 +03:00			`# coding: utf8`
Stub out support for French, Spanish, Italian and Portuguese 2016-11-02 22:02:41 +03:00			`from __future__ import unicode_literals`
Fix infixes in spanish and portuguese 2016-11-02 22:43:12 +03:00
Reorganise language data 2016-12-18 18:54:19 +03:00			`from .. import language_data as base`
			`from ..language_data import update_exc, strings_to_exc`
Add tokenizer exceptions for a.m. and p.m. in Spanish 2016-12-21 20:19:10 +03:00			`from ..symbols import ORTH, LEMMA`
Stub out support for French, Spanish, Italian and Portuguese 2016-11-02 22:02:41 +03:00
Reorganise language data 2016-12-18 18:54:19 +03:00			`from .stop_words import STOP_WORDS`
			`from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY`
Stub out support for French, Spanish, Italian and Portuguese 2016-11-02 22:02:41 +03:00
Add Spanish language data 2016-12-08 21:47:03 +03:00
Add tokenizer exceptions for a.m. and p.m. in Spanish 2016-12-21 20:19:10 +03:00			`def get_time_exc(hours):`
			`exc = {`
			`"12m.": [`
			`{ORTH: "12"},`
			`{ORTH: "m.", LEMMA: "p.m."}`
			`]`
			`}`

			`for hour in hours:`
Fix Issue #736: Times were being tokenized with incorrect string values. 2017-01-12 13:21:01 +03:00			`exc["%sa.m." % hour] = [`
Add tokenizer exceptions for a.m. and p.m. in Spanish 2016-12-21 20:19:10 +03:00			`{ORTH: hour},`
			`{ORTH: "a.m."}`
			`]`

Fix Issue #736: Times were being tokenized with incorrect string values. 2017-01-12 13:21:01 +03:00			`exc["%sp.m." % hour] = [`
Add tokenizer exceptions for a.m. and p.m. in Spanish 2016-12-21 20:19:10 +03:00			`{ORTH: hour},`
			`{ORTH: "p.m."}`
			`]`

Fix Issue #736: Times were being tokenized with incorrect string values. 2017-01-12 13:21:01 +03:00			`exc["%sam" % hour] = [`
Add tokenizer exceptions for a.m. and p.m. in Spanish 2016-12-21 20:19:10 +03:00			`{ORTH: hour},`
			`{ORTH: "am", LEMMA: "a.m."}`
			`]`

Fix Issue #736: Times were being tokenized with incorrect string values. 2017-01-12 13:21:01 +03:00			`exc["%spm" % hour] = [`
Add tokenizer exceptions for a.m. and p.m. in Spanish 2016-12-21 20:19:10 +03:00			`{ORTH: hour},`
			`{ORTH: "pm", LEMMA: "p.m."}`
			`]`
			`return exc`


Reorganise language data 2016-12-18 18:54:19 +03:00			`STOP_WORDS = set(STOP_WORDS)`
Add Spanish language data 2016-12-08 21:47:03 +03:00
Use global abbreviation data languages and remove duplicates 2017-01-08 22:36:00 +03:00
			`TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)`
Reorganise language data 2016-12-18 18:54:19 +03:00			`update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))`
Fix Issue #736: Times were being tokenized with incorrect string values. 2017-01-12 13:21:01 +03:00			`update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(`
			`['%d' % hour for hour in range(1, 12 + 1)]))`
Reorganise language data 2016-12-18 18:54:19 +03:00			`update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))`
Use global abbreviation data languages and remove duplicates 2017-01-08 22:36:00 +03:00			`update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))`

Add Spanish language data 2016-12-08 21:47:03 +03:00
Reorganise language data 2016-12-18 18:54:19 +03:00			`__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]`