Reorganise Portuguese language data

This commit is contained in:
ines 2017-05-08 15:52:01 +02:00
parent 279895ea83
commit 50510fa947
5 changed files with 85 additions and 115 deletions

View File

@ -1,12 +1,16 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals, print_function from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lemmatizer import LOOKUP
from ..language_data import BASE_EXCEPTIONS
from ..language import Language from ..language import Language
from ..attrs import LANG
from .language_data import *
from ..lemmatizerlookup import Lemmatizer from ..lemmatizerlookup import Lemmatizer
from .lemmatization import LOOK_UP from ..attrs import LANG
from ..util import update_exc
class Portuguese(Language): class Portuguese(Language):
lang = 'pt' lang = 'pt'
@ -15,12 +19,12 @@ class Portuguese(Language):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'pt' lex_attr_getters[LANG] = lambda text: 'pt'
tokenizer_exceptions = TOKENIZER_EXCEPTIONS tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS stop_words = set(STOP_WORDS)
@classmethod @classmethod
def create_lemmatizer(cls, nlp=None): def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOK_UP) return Lemmatizer(LOOKUP)
EXPORT = Portuguese __all__ = ['Portuguese']

View File

@ -1,7 +1,7 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
LOOK_UP = { LOOKUP = {
"Abris": "abril", "Abris": "abril",
"Agostos": "agosto", "Agostos": "agosto",
"Cérberos": "cérbero", "Cérberos": "cérbero",
@ -824769,4 +824769,4 @@ LOOK_UP = {
"úvidas": "úvido", "úvidas": "úvido",
"úvidos": "úvido", "úvidos": "úvido",
"úvulas": "úvula" "úvulas": "úvula"
} }

21
spacy/pt/lex_attrs.py Normal file
View File

@ -0,0 +1,21 @@
# coding: utf8
from __future__ import unicode_literals
# Number words
NUM_WORDS = set("""
zero um dois três quatro cinco seis sete oito nove dez onze doze treze catorze
quinze dezasseis dezassete dezoito dezanove vinte trinta quarenta cinquenta
sessenta setenta oitenta noventa cem mil milhão bilião trilião quadrilião
""".split())
# Ordinal words
ORDINAL_WORDS = set("""
primeiro segundo terceiro quarto quinto sexto sétimo oitavo nono décimo
vigésimo trigésimo quadragésimo quinquagésimo sexagésimo septuagésimo
octogésimo nonagésimo centésimo ducentésimo trecentésimo quadringentésimo
quingentésimo sexcentésimo septingentésimo octingentésimo nongentésimo
milésimo milionésimo bilionésimo
""".split())

View File

@ -67,22 +67,3 @@ vinda vindo vinte você vocês vos vossa vossas vosso vossos vários vão vêm v
zero zero
""".split()) """.split())
# Number words
NUM_WORDS = set("""
zero um dois três quatro cinco seis sete oito nove dez onze doze treze catorze
quinze dezasseis dezassete dezoito dezanove vinte trinta quarenta cinquenta
sessenta setenta oitenta noventa cem mil milhão bilião trilião quadrilião
""".split())
# Ordinal words
ORDINAL_WORDS = set("""
primeiro segundo terceiro quarto quinto sexto sétimo oitavo nono décimo
vigésimo trigésimo quadragésimo quinquagésimo sexagésimo septuagésimo
octogésimo nonagésimo centésimo ducentésimo trecentésimo quadringentésimo
quingentésimo sexcentésimo septingentésimo octingentésimo nongentésimo
milésimo milionésimo bilionésimo
""".split())

View File

@ -1,111 +1,75 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
from ..symbols import * from ..symbols import ORTH, LEMMA, NORM
from ..language_data import PRON_LEMMA from ..deprecated import PRON_LEMMA
TOKENIZER_EXCEPTIONS = {}
# Contractions _exc = {
CONTRACTIONS = {}
personal_pronoun = (
"ele", "ela", "eles", "elas"
)
demonstrative_pronouns = (
"este", "esta", "estes", "estas", "isto", "esse", "essa", "esses", "essas",
"isso", "aquele", "aquela", "aqueles", "aquelas", "aquilo"
)
undefined_pronouns = (
"outro", "outra", "outros", "outras"
)
adverbs = (
"aqui", "", "ali", "além"
)
for word in personal_pronoun + demonstrative_pronouns + \
undefined_pronouns + adverbs:
CONTRACTIONS["d" + word] = [
{ORTH: "d", NORM: "de"},
{ORTH: word}
]
for word in personal_pronoun + demonstrative_pronouns + \
undefined_pronouns:
CONTRACTIONS["n" + word] = [
{ORTH: "n", NORM: "em"},
{ORTH: word}
]
# Not so linear contractions "a"+something
CONTRACTIONS.update({
# This one cannot be split into 2
# "à": [
# {ORTH: "à", NORM: "a"},
# {ORTH: "", NORM: "a"}
# ],
"às": [ "às": [
{ORTH: "à", NORM: "a"}, {ORTH: "à", NORM: "a"},
{ORTH: "s", NORM: "as"} {ORTH: "s", NORM: "as"}],
],
"ao": [ "ao": [
{ORTH: "a"}, {ORTH: "a"},
{ORTH: "o"} {ORTH: "o"}],
],
"aos": [ "aos": [
{ORTH: "a"}, {ORTH: "a"},
{ORTH: "os"} {ORTH: "os"}],
],
"àquele": [ "àquele": [
{ORTH: "à", NORM: "a"}, {ORTH: "à", NORM: "a"},
{ORTH: "quele", NORM: "aquele"} {ORTH: "quele", NORM: "aquele"}],
],
"àquela": [ "àquela": [
{ORTH: "à", NORM: "a"}, {ORTH: "à", NORM: "a"},
{ORTH: "quela", NORM: "aquela"} {ORTH: "quela", NORM: "aquela"}],
],
"àqueles": [ "àqueles": [
{ORTH: "à", NORM: "a"}, {ORTH: "à", NORM: "a"},
{ORTH: "queles", NORM: "aqueles"} {ORTH: "queles", NORM: "aqueles"}],
],
"àquelas": [ "àquelas": [
{ORTH: "à", NORM: "a"}, {ORTH: "à", NORM: "a"},
{ORTH: "quelas", NORM: "aquelas"} {ORTH: "quelas", NORM: "aquelas"}],
],
"àquilo": [ "àquilo": [
{ORTH: "à", NORM: "a"}, {ORTH: "à", NORM: "a"},
{ORTH: "quilo", NORM: "aquilo"} {ORTH: "quilo", NORM: "aquilo"}],
],
"aonde": [ "aonde": [
{ORTH: "a"}, {ORTH: "a"},
{ORTH: "onde"} {ORTH: "onde"}]
], }
})
TOKENIZER_EXCEPTIONS.update(CONTRACTIONS)
# Abbreviations with only one ORTH token # Contractions
ORTH_ONLY = [ _per_pron = ["ele", "ela", "eles", "elas"]
"Adm.", _dem_pron = ["este", "esta", "estes", "estas", "isto", "esse", "essa", "esses",
"Dr.", "essas", "isso", "aquele", "aquela", "aqueles", "aquelas", "aquilo"]
"e.g.", _und_pron = ["outro", "outra", "outros", "outras"]
"E.g.", _adv = ["aqui", "", "ali", "além"]
"E.G.",
"Gen.",
"Gov.", for orth in _per_pron + _dem_pron + _und_pron + _adv:
"i.e.", _exc["d" + orth] = [
"I.e.", {ORTH: "d", NORM: "de"},
"I.E.", {ORTH: orth}]
"Jr.",
"Ltd.", for orth in _per_pron + _dem_pron + _und_pron:
"p.m.", _exc["n" + orth] = [
"Ph.D.", {ORTH: "n", NORM: "em"},
"Rep.", {ORTH: orth}]
"Rev.",
"Sen.",
"Sr.",
"Sra.", for orth in [
"vs.", "Adm.", "Dr.", "e.g.", "E.g.", "E.G.", "Gen.", "Gov.", "i.e.", "I.e.",
] "I.E.", "Jr.", "Ltd.", "p.m.", "Ph.D.", "Rep.", "Rev.", "Sen.", "Sr.",
"Sra.", "vs."]:
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = dict(_exc)