From 50510fa9473ae80fe14a0760f1c4e6cbdcdfb1e7 Mon Sep 17 00:00:00 2001
From: ines
Date: Mon, 8 May 2017 15:52:01 +0200
Subject: [PATCH] Reorganise Portuguese language data

---
 spacy/pt/__init__.py                         |  22 +-
 spacy/pt/{lemmatization.py => lemmatizer.py} |   4 +-
 spacy/pt/lex_attrs.py                        |  21 +++
 spacy/pt/stop_words.py                       |  19 ---
 spacy/pt/tokenizer_exceptions.py             | 134 +++++++------
 5 files changed, 85 insertions(+), 115 deletions(-)
 rename spacy/pt/{lemmatization.py => lemmatizer.py} (99%)
 create mode 100644 spacy/pt/lex_attrs.py

diff --git a/spacy/pt/__init__.py b/spacy/pt/__init__.py
index e473e0d23..0d68cf393 100644
--- a/spacy/pt/__init__.py
+++ b/spacy/pt/__init__.py
@@ -1,12 +1,16 @@
 # coding: utf8
-from __future__ import unicode_literals, print_function
+from __future__ import unicode_literals
 
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from .stop_words import STOP_WORDS
+from .lemmatizer import LOOKUP
+
+from ..language_data import BASE_EXCEPTIONS
 from ..language import Language
-from ..attrs import LANG
-
-from .language_data import *
 from ..lemmatizerlookup import Lemmatizer
-from .lemmatization import LOOK_UP
+from ..attrs import LANG
+from ..util import update_exc
+
 
 class Portuguese(Language):
     lang = 'pt'
@@ -15,12 +19,12 @@ class Portuguese(Language):
         lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
         lex_attr_getters[LANG] = lambda text: 'pt'
 
-        tokenizer_exceptions = TOKENIZER_EXCEPTIONS
-        stop_words = STOP_WORDS
+        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+        stop_words = set(STOP_WORDS)
 
     @classmethod
     def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOK_UP)
+        return Lemmatizer(LOOKUP)
 
 
-EXPORT = Portuguese
\ No newline at end of file
+__all__ = ['Portuguese']
diff --git a/spacy/pt/lemmatization.py b/spacy/pt/lemmatizer.py
similarity index 99%
rename from spacy/pt/lemmatization.py
rename to spacy/pt/lemmatizer.py
index e8243b49b..01765e04f 100644
--- a/spacy/pt/lemmatization.py
+++ b/spacy/pt/lemmatizer.py
@@ -1,7 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-LOOK_UP = {
+LOOKUP = {
     "Abris": "abril",
     "Agostos": "agosto",
     "Cérberos": "cérbero",
@@ -824769,4 +824769,4 @@ LOOK_UP = {
     "úvidas": "úvido",
     "úvidos": "úvido",
     "úvulas": "úvula"
-}
\ No newline at end of file
+}
diff --git a/spacy/pt/lex_attrs.py b/spacy/pt/lex_attrs.py
new file mode 100644
index 000000000..db54a1631
--- /dev/null
+++ b/spacy/pt/lex_attrs.py
@@ -0,0 +1,21 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+# Number words
+
+NUM_WORDS = set("""
+zero um dois três quatro cinco seis sete oito nove dez onze doze treze catorze
+quinze dezasseis dezassete dezoito dezanove vinte trinta quarenta cinquenta
+sessenta setenta oitenta noventa cem mil milhão bilião trilião quadrilião
+""".split())
+
+# Ordinal words
+
+ORDINAL_WORDS = set("""
+primeiro segundo terceiro quarto quinto sexto sétimo oitavo nono décimo
+vigésimo trigésimo quadragésimo quinquagésimo sexagésimo septuagésimo
+octogésimo nonagésimo centésimo ducentésimo trecentésimo quadringentésimo
+quingentésimo sexcentésimo septingentésimo octingentésimo nongentésimo
+milésimo milionésimo bilionésimo
+""".split())
diff --git a/spacy/pt/stop_words.py b/spacy/pt/stop_words.py
index a24356881..a18e8ded3 100644
--- a/spacy/pt/stop_words.py
+++ b/spacy/pt/stop_words.py
@@ -67,22 +67,3 @@
 vinda vindo vinte você vocês vos vossa vossas vosso vossos vários vão vêm vós
 zero
 """.split())
-
-
-# Number words
-
-NUM_WORDS = set("""
-zero um dois três quatro cinco seis sete oito nove dez onze doze treze catorze
-quinze dezasseis dezassete dezoito dezanove vinte trinta quarenta cinquenta
-sessenta setenta oitenta noventa cem mil milhão bilião trilião quadrilião
-""".split())
-
-# Ordinal words
-
-ORDINAL_WORDS = set("""
-primeiro segundo terceiro quarto quinto sexto sétimo oitavo nono décimo
-vigésimo trigésimo quadragésimo quinquagésimo sexagésimo septuagésimo
-octogésimo nonagésimo centésimo ducentésimo trecentésimo quadringentésimo
-quingentésimo sexcentésimo septingentésimo octingentésimo nongentésimo
-milésimo milionésimo bilionésimo
-""".split())
diff --git a/spacy/pt/tokenizer_exceptions.py b/spacy/pt/tokenizer_exceptions.py
index 1e02f6c6e..087014ca1 100644
--- a/spacy/pt/tokenizer_exceptions.py
+++ b/spacy/pt/tokenizer_exceptions.py
@@ -1,111 +1,75 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ..symbols import *
-from ..language_data import PRON_LEMMA
+from ..symbols import ORTH, LEMMA, NORM
+from ..deprecated import PRON_LEMMA
 
 
-TOKENIZER_EXCEPTIONS = {}
-# Contractions
-CONTRACTIONS = {}
-
-personal_pronoun = (
-    "ele", "ela", "eles", "elas"
-)
-demonstrative_pronouns = (
-    "este", "esta", "estes", "estas", "isto", "esse", "essa", "esses", "essas",
-    "isso", "aquele", "aquela", "aqueles", "aquelas", "aquilo"
-)
-undefined_pronouns = (
-    "outro", "outra", "outros", "outras"
-)
-adverbs = (
-    "aqui", "aí", "ali", "além"
-)
-
-for word in personal_pronoun + demonstrative_pronouns + \
-        undefined_pronouns + adverbs:
-    CONTRACTIONS["d" + word] = [
-        {ORTH: "d", NORM: "de"},
-        {ORTH: word}
-    ]
-
-for word in personal_pronoun + demonstrative_pronouns + \
-        undefined_pronouns:
-    CONTRACTIONS["n" + word] = [
-        {ORTH: "n", NORM: "em"},
-        {ORTH: word}
-    ]
-
-# Not so linear contractions "a"+something
-
-CONTRACTIONS.update({
-    # This one cannot be split into 2
-    # "à": [
-    #     {ORTH: "à", NORM: "a"},
-    #     {ORTH: "", NORM: "a"}
-    # ],
+_exc = {
     "às": [
         {ORTH: "à", NORM: "a"},
-        {ORTH: "s", NORM: "as"}
-    ],
+        {ORTH: "s", NORM: "as"}],
+
     "ao": [
         {ORTH: "a"},
-        {ORTH: "o"}
-    ],
+        {ORTH: "o"}],
+
     "aos": [
         {ORTH: "a"},
-        {ORTH: "os"}
-    ],
+        {ORTH: "os"}],
+
     "àquele": [
         {ORTH: "à", NORM: "a"},
-        {ORTH: "quele", NORM: "aquele"}
-    ],
+        {ORTH: "quele", NORM: "aquele"}],
+
     "àquela": [
         {ORTH: "à", NORM: "a"},
-        {ORTH: "quela", NORM: "aquela"}
-    ],
+        {ORTH: "quela", NORM: "aquela"}],
+
     "àqueles": [
         {ORTH: "à", NORM: "a"},
-        {ORTH: "queles", NORM: "aqueles"}
-    ],
+        {ORTH: "queles", NORM: "aqueles"}],
+
     "àquelas": [
         {ORTH: "à", NORM: "a"},
-        {ORTH: "quelas", NORM: "aquelas"}
-    ],
+        {ORTH: "quelas", NORM: "aquelas"}],
+
     "àquilo": [
         {ORTH: "à", NORM: "a"},
-        {ORTH: "quilo", NORM: "aquilo"}
-    ],
+        {ORTH: "quilo", NORM: "aquilo"}],
+
     "aonde": [
         {ORTH: "a"},
-        {ORTH: "onde"}
-    ],
-})
+        {ORTH: "onde"}]
+}
 
-TOKENIZER_EXCEPTIONS.update(CONTRACTIONS)
 
-# Abbreviations with only one ORTH token
+# Contractions
 
-ORTH_ONLY = [
-    "Adm.",
-    "Dr.",
-    "e.g.",
-    "E.g.",
-    "E.G.",
-    "Gen.",
-    "Gov.",
-    "i.e.",
-    "I.e.",
-    "I.E.",
-    "Jr.",
-    "Ltd.",
-    "p.m.",
-    "Ph.D.",
-    "Rep.",
-    "Rev.",
-    "Sen.",
-    "Sr.",
-    "Sra.",
-    "vs.",
-]
+_per_pron = ["ele", "ela", "eles", "elas"]
+_dem_pron = ["este", "esta", "estes", "estas", "isto", "esse", "essa", "esses",
+             "essas", "isso", "aquele", "aquela", "aqueles", "aquelas", "aquilo"]
+_und_pron = ["outro", "outra", "outros", "outras"]
+_adv = ["aqui", "aí", "ali", "além"]
+
+
+for orth in _per_pron + _dem_pron + _und_pron + _adv:
+    _exc["d" + orth] = [
+        {ORTH: "d", NORM: "de"},
+        {ORTH: orth}]
+
+for orth in _per_pron + _dem_pron + _und_pron:
+    _exc["n" + orth] = [
+        {ORTH: "n", NORM: "em"},
+        {ORTH: orth}]
+
+
+
+for orth in [
+    "Adm.", "Dr.", "e.g.", "E.g.", "E.G.", "Gen.", "Gov.", "i.e.", "I.e.",
+    "I.E.", "Jr.", "Ltd.", "p.m.", "Ph.D.", "Rep.", "Rev.", "Sen.", "Sr.",
+    "Sra.", "vs."]:
+    _exc[orth] = [{ORTH: orth}]
+
+
+TOKENIZER_EXCEPTIONS = dict(_exc)