From 50510fa9473ae80fe14a0760f1c4e6cbdcdfb1e7 Mon Sep 17 00:00:00 2001
From: ines
Date: Mon, 8 May 2017 15:52:01 +0200
Subject: [PATCH] Reorganise Portuguese language data

---
 spacy/pt/__init__.py                         |  22 +-
 spacy/pt/{lemmatization.py => lemmatizer.py} |   4 +-
 spacy/pt/lex_attrs.py                        |  21 +++
 spacy/pt/stop_words.py                       |  19 ---
 spacy/pt/tokenizer_exceptions.py             | 134 +++++++------
 5 files changed, 85 insertions(+), 115 deletions(-)
 rename spacy/pt/{lemmatization.py => lemmatizer.py} (99%)
 create mode 100644 spacy/pt/lex_attrs.py

diff --git a/spacy/pt/__init__.py b/spacy/pt/__init__.py
index e473e0d23..0d68cf393 100644
--- a/spacy/pt/__init__.py
+++ b/spacy/pt/__init__.py
@@ -1,12 +1,16 @@
 # coding: utf8
-from __future__ import unicode_literals, print_function
+from __future__ import unicode_literals
 
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from .stop_words import STOP_WORDS
+from .lemmatizer import LOOKUP
+
+from ..language_data import BASE_EXCEPTIONS
 from ..language import Language
-from ..attrs import LANG
-
-from .language_data import *
 from ..lemmatizerlookup import Lemmatizer
-from .lemmatization import LOOK_UP
+from ..attrs import LANG
+from ..util import update_exc
+
 
 class Portuguese(Language):
     lang = 'pt'
@@ -15,12 +19,12 @@ class Portuguese(Language):
         lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
         lex_attr_getters[LANG] = lambda text: 'pt'
 
-        tokenizer_exceptions = TOKENIZER_EXCEPTIONS
-        stop_words = STOP_WORDS
+        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+        stop_words = set(STOP_WORDS)
 
     @classmethod
     def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOK_UP)
+        return Lemmatizer(LOOKUP)
 
 
-EXPORT = Portuguese
\ No newline at end of file
+__all__ = ['Portuguese']
diff --git a/spacy/pt/lemmatization.py b/spacy/pt/lemmatizer.py
similarity index 99%
rename from spacy/pt/lemmatization.py
rename to spacy/pt/lemmatizer.py
index e8243b49b..01765e04f 100644
--- a/spacy/pt/lemmatization.py
+++ b/spacy/pt/lemmatizer.py
@@ -1,7 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-LOOK_UP = {
+LOOKUP = {
     "Abris": "abril",
     "Agostos": "agosto",
     "Cérberos": "cérbero",
@@ -824769,4 +824769,4 @@ LOOK_UP = {
     "úvidas": "úvido",
     "úvidos": "úvido",
     "úvulas": "úvula"
-}
\ No newline at end of file
+}
diff --git a/spacy/pt/lex_attrs.py b/spacy/pt/lex_attrs.py
new file mode 100644
index 000000000..db54a1631
--- /dev/null
+++ b/spacy/pt/lex_attrs.py
@@ -0,0 +1,21 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+# Number words
+
+NUM_WORDS = set("""
+zero um dois três quatro cinco seis sete oito nove dez onze doze treze catorze
+quinze dezasseis dezassete dezoito dezanove vinte trinta quarenta cinquenta
+sessenta setenta oitenta noventa cem mil milhão bilião trilião quadrilião
+""".split())
+
+# Ordinal words
+
+ORDINAL_WORDS = set("""
+primeiro segundo terceiro quarto quinto sexto sétimo oitavo nono décimo
+vigésimo trigésimo quadragésimo quinquagésimo sexagésimo septuagésimo
+octogésimo nonagésimo centésimo ducentésimo trecentésimo quadringentésimo
+quingentésimo sexcentésimo septingentésimo octingentésimo nongentésimo
+milésimo milionésimo bilionésimo
+""".split())
diff --git a/spacy/pt/stop_words.py b/spacy/pt/stop_words.py
index a24356881..a18e8ded3 100644
--- a/spacy/pt/stop_words.py
+++ b/spacy/pt/stop_words.py
@@ -67,22 +67,3 @@
 vinda vindo vinte você vocês vos vossa vossas vosso vossos vários vão vêm vós
 zero
 """.split())
-
-
-# Number words
-
-NUM_WORDS = set("""
-zero um dois três quatro cinco seis sete oito nove dez onze doze treze catorze
-quinze dezasseis dezassete dezoito dezanove vinte trinta quarenta cinquenta
-sessenta setenta oitenta noventa cem mil milhão bilião trilião quadrilião
-""".split())
-
-# Ordinal words
-
-ORDINAL_WORDS = set("""
-primeiro segundo terceiro quarto quinto sexto sétimo oitavo nono décimo
-vigésimo trigésimo quadragésimo quinquagésimo sexagésimo septuagésimo
-octogésimo nonagésimo centésimo ducentésimo trecentésimo quadringentésimo
-quingentésimo sexcentésimo septingentésimo octingentésimo nongentésimo
-milésimo milionésimo bilionésimo
-""".split())
diff --git a/spacy/pt/tokenizer_exceptions.py b/spacy/pt/tokenizer_exceptions.py
index 1e02f6c6e..087014ca1 100644
--- a/spacy/pt/tokenizer_exceptions.py
+++ b/spacy/pt/tokenizer_exceptions.py
@@ -1,111 +1,75 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ..symbols import *
-from ..language_data import PRON_LEMMA
+from ..symbols import ORTH, LEMMA, NORM
+from ..deprecated import PRON_LEMMA
 
 
-TOKENIZER_EXCEPTIONS = {}
-# Contractions
-CONTRACTIONS = {}
-
-personal_pronoun = (
-    "ele", "ela", "eles", "elas"
-)
-demonstrative_pronouns = (
-    "este", "esta", "estes", "estas", "isto", "esse", "essa", "esses", "essas",
-    "isso", "aquele", "aquela", "aqueles", "aquelas", "aquilo"
-)
-undefined_pronouns = (
-    "outro", "outra", "outros", "outras"
-)
-adverbs = (
-    "aqui", "aí", "ali", "além"
-)
-
-for word in personal_pronoun + demonstrative_pronouns + \
-        undefined_pronouns + adverbs:
-    CONTRACTIONS["d" + word] = [
-        {ORTH: "d", NORM: "de"},
-        {ORTH: word}
-    ]
-
-for word in personal_pronoun + demonstrative_pronouns + \
-        undefined_pronouns:
-    CONTRACTIONS["n" + word] = [
-        {ORTH: "n", NORM: "em"},
-        {ORTH: word}
-    ]
-
-# Not so linear contractions "a"+something
-
-CONTRACTIONS.update({
-    # This one cannot be split into 2
-    # "à": [
-    #     {ORTH: "à", NORM: "a"},
-    #     {ORTH: "", NORM: "a"}
-    # ],
+_exc = {
     "às": [
         {ORTH: "à", NORM: "a"},
-        {ORTH: "s", NORM: "as"}
-    ],
+        {ORTH: "s", NORM: "as"}],
+
     "ao": [
         {ORTH: "a"},
-        {ORTH: "o"}
-    ],
+        {ORTH: "o"}],
+
     "aos": [
         {ORTH: "a"},
-        {ORTH: "os"}
-    ],
+        {ORTH: "os"}],
+
     "àquele": [
         {ORTH: "à", NORM: "a"},
-        {ORTH: "quele", NORM: "aquele"}
-    ],
+        {ORTH: "quele", NORM: "aquele"}],
+
     "àquela": [
         {ORTH: "à", NORM: "a"},
-        {ORTH: "quela", NORM: "aquela"}
-    ],
+        {ORTH: "quela", NORM: "aquela"}],
+
     "àqueles": [
         {ORTH: "à", NORM: "a"},
-        {ORTH: "queles", NORM: "aqueles"}
-    ],
+        {ORTH: "queles", NORM: "aqueles"}],
+
     "àquelas": [
         {ORTH: "à", NORM: "a"},
-        {ORTH: "quelas", NORM: "aquelas"}
-    ],
+        {ORTH: "quelas", NORM: "aquelas"}],
+
     "àquilo": [
         {ORTH: "à", NORM: "a"},
-        {ORTH: "quilo", NORM: "aquilo"}
-    ],
+        {ORTH: "quilo", NORM: "aquilo"}],
+
     "aonde": [
         {ORTH: "a"},
-        {ORTH: "onde"}
-    ],
-})
+        {ORTH: "onde"}]
+}
 
-TOKENIZER_EXCEPTIONS.update(CONTRACTIONS)
 
-# Abbreviations with only one ORTH token
+# Contractions
 
-ORTH_ONLY = [
-    "Adm.",
-    "Dr.",
-    "e.g.",
-    "E.g.",
-    "E.G.",
-    "Gen.",
-    "Gov.",
-    "i.e.",
-    "I.e.",
-    "I.E.",
-    "Jr.",
-    "Ltd.",
-    "p.m.",
-    "Ph.D.",
-    "Rep.",
-    "Rev.",
-    "Sen.",
-    "Sr.",
-    "Sra.",
-    "vs.",
-]
+_per_pron = ["ele", "ela", "eles", "elas"]
+_dem_pron = ["este", "esta", "estes", "estas", "isto", "esse", "essa", "esses",
+             "essas", "isso", "aquele", "aquela", "aqueles", "aquelas", "aquilo"]
+_und_pron = ["outro", "outra", "outros", "outras"]
+_adv = ["aqui", "aí", "ali", "além"]
+
+
+for orth in _per_pron + _dem_pron + _und_pron + _adv:
+    _exc["d" + orth] = [
+        {ORTH: "d", NORM: "de"},
+        {ORTH: orth}]
+
+for orth in _per_pron + _dem_pron + _und_pron:
+    _exc["n" + orth] = [
+        {ORTH: "n", NORM: "em"},
+        {ORTH: orth}]
+
+
+
+for orth in [
+    "Adm.", "Dr.", "e.g.", "E.g.", "E.G.", "Gen.", "Gov.", "i.e.", "I.e.",
+    "I.E.", "Jr.", "Ltd.", "p.m.", "Ph.D.", "Rep.", "Rev.", "Sen.", "Sr.",
+    "Sra.", "vs."]:
+    _exc[orth] = [{ORTH: orth}]
+
+
+TOKENIZER_EXCEPTIONS = dict(_exc)