Reorganise Portuguese language data

2025-08-31 01:15:06 +03:00 · 2017-05-08 15:52:01 +02:00 · 2017-05-08 15:52:01 +02:00 · 50510fa947
commit 50510fa947
parent 279895ea83
5 changed files with 85 additions and 115 deletions
--- a/spacy/pt/__init__.py
+++ b/spacy/pt/__init__.py
@@ -1,12 +1,16 @@
 # coding: utf8
-from __future__ import unicode_literals, print_function
+from __future__ import unicode_literals
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lemmatizer import LOOKUP
 from ..language_data import BASE_EXCEPTIONS
 from ..language import Language
 from ..attrs import LANG
 from .language_data import *
 from ..lemmatizerlookup import Lemmatizer
-from .lemmatization import LOOK_UP
+from ..attrs import LANG
 from ..util import update_exc
 class Portuguese(Language):
    lang = 'pt'
@@ -15,12 +19,12 @@ class Portuguese(Language):
        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
        lex_attr_getters[LANG] = lambda text: 'pt'
-        tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-        stop_words = STOP_WORDS
+        stop_words = set(STOP_WORDS)
        @classmethod
        def create_lemmatizer(cls, nlp=None):
-            return Lemmatizer(LOOK_UP)
+            return Lemmatizer(LOOKUP)
-EXPORT = Portuguese
+__all__ = ['Portuguese']
--- a/spacy/pt/lemmatization.py
+++ b/spacy/pt/lemmatization.py
@@ -1,7 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals
-LOOK_UP = {
+LOOKUP = {
    "Abris": "abril",
    "Agostos": "agosto",
    "Cérberos": "cérbero",
--- a/spacy/pt/lex_attrs.py
+++ b/spacy/pt/lex_attrs.py
@@ -0,0 +1,21 @@
 # coding: utf8
 from __future__ import unicode_literals
 # Number words
 NUM_WORDS = set("""
 zero um dois três quatro cinco seis sete oito nove dez onze doze treze catorze
 quinze dezasseis dezassete dezoito dezanove vinte trinta quarenta cinquenta
 sessenta setenta oitenta noventa cem mil milhão bilião trilião quadrilião
 """.split())
 # Ordinal words
 ORDINAL_WORDS = set("""
 primeiro segundo terceiro quarto quinto sexto sétimo oitavo nono décimo
 vigésimo trigésimo quadragésimo quinquagésimo sexagésimo septuagésimo
 octogésimo nonagésimo centésimo ducentésimo trecentésimo quadringentésimo
 quingentésimo sexcentésimo septingentésimo octingentésimo nongentésimo
 milésimo milionésimo bilionésimo
 """.split())
--- a/spacy/pt/stop_words.py
+++ b/spacy/pt/stop_words.py
@@ -67,22 +67,3 @@ vinda vindo vinte você vocês vos vossa vossas vosso vossos vários vão vêm v
 zero
 """.split())
 # Number words
 NUM_WORDS = set("""
 zero um dois três quatro cinco seis sete oito nove dez onze doze treze catorze
 quinze dezasseis dezassete dezoito dezanove vinte trinta quarenta cinquenta
 sessenta setenta oitenta noventa cem mil milhão bilião trilião quadrilião
 """.split())
 # Ordinal words
 ORDINAL_WORDS = set("""
 primeiro segundo terceiro quarto quinto sexto sétimo oitavo nono décimo
 vigésimo trigésimo quadragésimo quinquagésimo sexagésimo septuagésimo
 octogésimo nonagésimo centésimo ducentésimo trecentésimo quadringentésimo
 quingentésimo sexcentésimo septingentésimo octingentésimo nongentésimo
 milésimo milionésimo bilionésimo
 """.split())
--- a/spacy/pt/tokenizer_exceptions.py
+++ b/spacy/pt/tokenizer_exceptions.py
@@ -1,111 +1,75 @@
 # coding: utf8
 from __future__ import unicode_literals
-from ..symbols import *
+from ..symbols import ORTH, LEMMA, NORM
-from ..language_data import PRON_LEMMA
+from ..deprecated import PRON_LEMMA
 TOKENIZER_EXCEPTIONS = {}
-# Contractions
+_exc = {
 CONTRACTIONS = {}
 personal_pronoun = (
    "ele", "ela", "eles", "elas"
 )
 demonstrative_pronouns = (
    "este", "esta", "estes", "estas", "isto", "esse", "essa", "esses", "essas",
    "isso", "aquele", "aquela", "aqueles", "aquelas", "aquilo"
 )
 undefined_pronouns = (
    "outro", "outra", "outros", "outras"
 )
 adverbs = (
    "aqui", "aí", "ali", "além"
 )
 for word in personal_pronoun + demonstrative_pronouns + \
            undefined_pronouns + adverbs:
    CONTRACTIONS["d" + word] = [
        {ORTH: "d", NORM: "de"},
        {ORTH: word}
    ]
 for word in personal_pronoun + demonstrative_pronouns + \
            undefined_pronouns:
    CONTRACTIONS["n" + word] = [
        {ORTH: "n", NORM: "em"},
        {ORTH: word}
    ]
 # Not so linear contractions "a"+something
 CONTRACTIONS.update({
    # This one cannot be split into 2
    # "à": [
    #     {ORTH: "à", NORM: "a"},
    #     {ORTH: "", NORM: "a"}
    # ],
    "às": [
        {ORTH: "à", NORM: "a"},
-        {ORTH: "s", NORM: "as"}
+        {ORTH: "s", NORM: "as"}],
-    ],
+
    "ao": [
        {ORTH: "a"},
-        {ORTH: "o"}
+        {ORTH: "o"}],
-    ],
+
    "aos": [
        {ORTH: "a"},
-        {ORTH: "os"}
+        {ORTH: "os"}],
-    ],
+
    "àquele": [
        {ORTH: "à", NORM: "a"},
-        {ORTH: "quele", NORM: "aquele"}
+        {ORTH: "quele", NORM: "aquele"}],
-    ],
+
    "àquela": [
        {ORTH: "à", NORM: "a"},
-        {ORTH: "quela", NORM: "aquela"}
+        {ORTH: "quela", NORM: "aquela"}],
-    ],
+
    "àqueles": [
        {ORTH: "à", NORM: "a"},
-        {ORTH: "queles", NORM: "aqueles"}
+        {ORTH: "queles", NORM: "aqueles"}],
-    ],
+
    "àquelas": [
        {ORTH: "à", NORM: "a"},
-        {ORTH: "quelas", NORM: "aquelas"}
+        {ORTH: "quelas", NORM: "aquelas"}],
-    ],
+
    "àquilo": [
        {ORTH: "à", NORM: "a"},
-        {ORTH: "quilo", NORM: "aquilo"}
+        {ORTH: "quilo", NORM: "aquilo"}],
-    ],
+
    "aonde": [
        {ORTH: "a"},
-        {ORTH: "onde"}
+        {ORTH: "onde"}]
-    ],
+}
 })
 TOKENIZER_EXCEPTIONS.update(CONTRACTIONS)
-# Abbreviations with only one ORTH token
+# Contractions
-ORTH_ONLY = [
+_per_pron = ["ele", "ela", "eles", "elas"]
-    "Adm.",
+_dem_pron = ["este", "esta", "estes", "estas", "isto", "esse", "essa", "esses",
-    "Dr.",
+             "essas", "isso", "aquele", "aquela", "aqueles", "aquelas", "aquilo"]
-    "e.g.",
+_und_pron = ["outro", "outra", "outros", "outras"]
-    "E.g.",
+_adv = ["aqui", "aí", "ali", "além"]
-    "E.G.",
+
-    "Gen.",
+
-    "Gov.",
+for orth in _per_pron + _dem_pron + _und_pron + _adv:
-    "i.e.",
+    _exc["d" + orth] = [
-    "I.e.",
+        {ORTH: "d", NORM: "de"},
-    "I.E.",
+        {ORTH: orth}]
-    "Jr.",
+
-    "Ltd.",
+for orth in _per_pron + _dem_pron + _und_pron:
-    "p.m.",
+    _exc["n" + orth] = [
-    "Ph.D.",
+        {ORTH: "n", NORM: "em"},
-    "Rep.",
+        {ORTH: orth}]
-    "Rev.",
+
-    "Sen.",
+
-    "Sr.",
+
-    "Sra.",
+for orth in [
-    "vs.",
+    "Adm.", "Dr.", "e.g.", "E.g.", "E.G.", "Gen.", "Gov.", "i.e.", "I.e.",
-]
+    "I.E.", "Jr.", "Ltd.", "p.m.", "Ph.D.", "Rep.", "Rev.", "Sen.", "Sr.",
    "Sra.", "vs."]:
    _exc[orth] = [{ORTH: orth}]
 TOKENIZER_EXCEPTIONS = dict(_exc)