Reorganise Portuguese language data

ines 2017-05-08 15:52:01 +02:00
parent 279895ea83
commit 50510fa947
5 changed files with 85 additions and 115 deletions

spacy/pt/__init__.py

@@ -1,12 +1,16 @@
# coding: utf8
from __future__ import unicode_literals, print_function
from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lemmatizer import LOOKUP
from ..language_data import BASE_EXCEPTIONS
from ..language import Language
from ..attrs import LANG
from .language_data import *
from ..lemmatizerlookup import Lemmatizer
from .lemmatization import LOOK_UP
from ..attrs import LANG
from ..util import update_exc
class Portuguese(Language):
lang = 'pt'
@@ -15,12 +19,12 @@ class Portuguese(Language):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'pt'
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
stop_words = STOP_WORDS
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOK_UP)
return Lemmatizer(LOOKUP)
EXPORT = Portuguese
__all__ = ['Portuguese']
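The Defaults now build the exception table by merging the shared BASE_EXCEPTIONS with the Portuguese-specific TOKENIZER_EXCEPTIONS via update_exc. Below is a minimal plain-Python sketch of what such a merge amounts to, assuming language-specific entries simply override base entries on conflicting keys; it does not use spaCy's own update_exc.

# Minimal sketch of a base-plus-language exception merge, assuming later
# tables override earlier ones on conflicting keys (plain dicts, not spaCy).
BASE_EXCEPTIONS = {":)": [{"ORTH": ":)"}]}             # shared across languages
PT_EXCEPTIONS = {"às": [{"ORTH": "à", "NORM": "a"},
                        {"ORTH": "s", "NORM": "as"}]}  # Portuguese-specific

def merge_exceptions(*tables):
    merged = {}
    for table in tables:
        merged.update(table)   # later tables win on conflicting keys
    return merged

exceptions = merge_exceptions(BASE_EXCEPTIONS, PT_EXCEPTIONS)
assert set(exceptions) == {":)", "às"}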

spacy/pt/lemmatizer.py

@@ -1,7 +1,7 @@
# coding: utf8
from __future__ import unicode_literals
LOOK_UP = {
LOOKUP = {
"Abris": "abril",
"Agostos": "agosto",
"Cérberos": "cérbero",

spacy/pt/lex_attrs.py (new file, 21 additions)

@@ -0,0 +1,21 @@
# coding: utf8
from __future__ import unicode_literals
# Number words
NUM_WORDS = set("""
zero um dois três quatro cinco seis sete oito nove dez onze doze treze catorze
quinze dezasseis dezassete dezoito dezanove vinte trinta quarenta cinquenta
sessenta setenta oitenta noventa cem mil milhão bilião trilião quadrilião
""".split())
# Ordinal words
ORDINAL_WORDS = set("""
primeiro segundo terceiro quarto quinto sexto sétimo oitavo nono décimo
vigésimo trigésimo quadragésimo quinquagésimo sexagésimo septuagésimo
octogésimo nonagésimo centésimo ducentésimo trecentésimo quadringentésimo
quingentésimo sexcentésimo septingentésimo octingentésimo nongentésimo
milésimo milionésimo bilionésimo
""".split())

spacy/pt/stop_words.py

@@ -67,22 +67,3 @@ vinda vindo vinte você vocês vos vossa vossas vosso vossos vários vão vêm v
zero
""".split())
# Number words
NUM_WORDS = set("""
zero um dois três quatro cinco seis sete oito nove dez onze doze treze catorze
quinze dezasseis dezassete dezoito dezanove vinte trinta quarenta cinquenta
sessenta setenta oitenta noventa cem mil milhão bilião trilião quadrilião
""".split())
# Ordinal words
ORDINAL_WORDS = set("""
primeiro segundo terceiro quarto quinto sexto sétimo oitavo nono décimo
vigésimo trigésimo quadragésimo quinquagésimo sexagésimo septuagésimo
octogésimo nonagésimo centésimo ducentésimo trecentésimo quadringentésimo
quingentésimo sexcentésimo septingentésimo octingentésimo nongentésimo
milésimo milionésimo bilionésimo
""".split())

spacy/pt/tokenizer_exceptions.py

@@ -1,111 +1,75 @@
# coding: utf8
from __future__ import unicode_literals
from ..symbols import *
from ..language_data import PRON_LEMMA
from ..symbols import ORTH, LEMMA, NORM
from ..deprecated import PRON_LEMMA
TOKENIZER_EXCEPTIONS = {}
# Contractions
CONTRACTIONS = {}
personal_pronoun = (
"ele", "ela", "eles", "elas"
)
demonstrative_pronouns = (
"este", "esta", "estes", "estas", "isto", "esse", "essa", "esses", "essas",
"isso", "aquele", "aquela", "aqueles", "aquelas", "aquilo"
)
undefined_pronouns = (
"outro", "outra", "outros", "outras"
)
adverbs = (
"aqui", "", "ali", "além"
)
for word in personal_pronoun + demonstrative_pronouns + \
undefined_pronouns + adverbs:
CONTRACTIONS["d" + word] = [
{ORTH: "d", NORM: "de"},
{ORTH: word}
]
for word in personal_pronoun + demonstrative_pronouns + \
undefined_pronouns:
CONTRACTIONS["n" + word] = [
{ORTH: "n", NORM: "em"},
{ORTH: word}
]
# Not so linear contractions "a"+something
CONTRACTIONS.update({
# This one cannot be split into 2
# "à": [
# {ORTH: "à", NORM: "a"},
# {ORTH: "", NORM: "a"}
# ],
_exc = {
"às": [
{ORTH: "à", NORM: "a"},
{ORTH: "s", NORM: "as"}
],
{ORTH: "s", NORM: "as"}],
"ao": [
{ORTH: "a"},
{ORTH: "o"}
],
{ORTH: "o"}],
"aos": [
{ORTH: "a"},
{ORTH: "os"}
],
{ORTH: "os"}],
"àquele": [
{ORTH: "à", NORM: "a"},
{ORTH: "quele", NORM: "aquele"}
],
{ORTH: "quele", NORM: "aquele"}],
"àquela": [
{ORTH: "à", NORM: "a"},
{ORTH: "quela", NORM: "aquela"}
],
{ORTH: "quela", NORM: "aquela"}],
"àqueles": [
{ORTH: "à", NORM: "a"},
{ORTH: "queles", NORM: "aqueles"}
],
{ORTH: "queles", NORM: "aqueles"}],
"àquelas": [
{ORTH: "à", NORM: "a"},
{ORTH: "quelas", NORM: "aquelas"}
],
{ORTH: "quelas", NORM: "aquelas"}],
"àquilo": [
{ORTH: "à", NORM: "a"},
{ORTH: "quilo", NORM: "aquilo"}
],
{ORTH: "quilo", NORM: "aquilo"}],
"aonde": [
{ORTH: "a"},
{ORTH: "onde"}
],
})
{ORTH: "onde"}]
}
TOKENIZER_EXCEPTIONS.update(CONTRACTIONS)
# Abbreviations with only one ORTH token
# Contractions
ORTH_ONLY = [
"Adm.",
"Dr.",
"e.g.",
"E.g.",
"E.G.",
"Gen.",
"Gov.",
"i.e.",
"I.e.",
"I.E.",
"Jr.",
"Ltd.",
"p.m.",
"Ph.D.",
"Rep.",
"Rev.",
"Sen.",
"Sr.",
"Sra.",
"vs.",
]
_per_pron = ["ele", "ela", "eles", "elas"]
_dem_pron = ["este", "esta", "estes", "estas", "isto", "esse", "essa", "esses",
"essas", "isso", "aquele", "aquela", "aqueles", "aquelas", "aquilo"]
_und_pron = ["outro", "outra", "outros", "outras"]
_adv = ["aqui", "aí", "ali", "além"]
for orth in _per_pron + _dem_pron + _und_pron + _adv:
_exc["d" + orth] = [
{ORTH: "d", NORM: "de"},
{ORTH: orth}]
for orth in _per_pron + _dem_pron + _und_pron:
_exc["n" + orth] = [
{ORTH: "n", NORM: "em"},
{ORTH: orth}]
for orth in [
"Adm.", "Dr.", "e.g.", "E.g.", "E.G.", "Gen.", "Gov.", "i.e.", "I.e.",
"I.E.", "Jr.", "Ltd.", "p.m.", "Ph.D.", "Rep.", "Rev.", "Sen.", "Sr.",
"Sra.", "vs."]:
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = dict(_exc)
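To make the generated table concrete, here is a self-contained sketch that replays the first loop with plain string keys in place of spaCy's ORTH/NORM symbols; only the shape of the entries is meant to match the code above.

# Replays the "de" + pronoun/adverb loop with plain dicts and string keys.
_per_pron = ["ele", "ela", "eles", "elas"]
_adv = ["aqui", "aí", "ali", "além"]

_exc = {}
for orth in _per_pron + _adv:
    # Contractions like "dele" and "daqui" split into "d" + the base word,
    # with the "d" piece normalised back to "de".
    _exc["d" + orth] = [{"ORTH": "d", "NORM": "de"}, {"ORTH": orth}]

print(_exc["dele"])   # [{'ORTH': 'd', 'NORM': 'de'}, {'ORTH': 'ele'}]
print(_exc["daqui"])  # [{'ORTH': 'd', 'NORM': 'de'}, {'ORTH': 'aqui'}]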