diff --git a/spacy/lang/pt/tokenizer_exceptions.py b/spacy/lang/pt/tokenizer_exceptions.py index 5169780e6..c36af6771 100644 --- a/spacy/lang/pt/tokenizer_exceptions.py +++ b/spacy/lang/pt/tokenizer_exceptions.py @@ -4,69 +4,47 @@ from __future__ import unicode_literals from ...symbols import ORTH, NORM -_exc = { - "às": [{ORTH: "à", NORM: "a"}, {ORTH: "s", NORM: "as"}], - "ao": [{ORTH: "a"}, {ORTH: "o"}], - "aos": [{ORTH: "a"}, {ORTH: "os"}], - "àquele": [{ORTH: "à", NORM: "a"}, {ORTH: "quele", NORM: "aquele"}], - "àquela": [{ORTH: "à", NORM: "a"}, {ORTH: "quela", NORM: "aquela"}], - "àqueles": [{ORTH: "à", NORM: "a"}, {ORTH: "queles", NORM: "aqueles"}], - "àquelas": [{ORTH: "à", NORM: "a"}, {ORTH: "quelas", NORM: "aquelas"}], - "àquilo": [{ORTH: "à", NORM: "a"}, {ORTH: "quilo", NORM: "aquilo"}], - "aonde": [{ORTH: "a"}, {ORTH: "onde"}], -} - - -# Contractions -_per_pron = ["ele", "ela", "eles", "elas"] -_dem_pron = [ - "este", - "esta", - "estes", - "estas", - "isto", - "esse", - "essa", - "esses", - "essas", - "isso", - "aquele", - "aquela", - "aqueles", - "aquelas", - "aquilo", -] -_und_pron = ["outro", "outra", "outros", "outras"] -_adv = ["aqui", "aí", "ali", "além"] - - -for orth in _per_pron + _dem_pron + _und_pron + _adv: - _exc["d" + orth] = [{ORTH: "d", NORM: "de"}, {ORTH: orth}] - -for orth in _per_pron + _dem_pron + _und_pron: - _exc["n" + orth] = [{ORTH: "n", NORM: "em"}, {ORTH: orth}] +_exc = {} for orth in [ "Adm.", + "Art.", + "art.", + "Av.", + "av.", + "Cia.", + "dom.", "Dr.", + "dr.", "e.g.", "E.g.", "E.G.", + "e/ou", + "ed.", + "eng.", + "etc.", + "Fund.", "Gen.", "Gov.", "i.e.", "I.e.", "I.E.", + "Inc.", "Jr.", + "km/h", "Ltd.", + "Mr.", "p.m.", "Ph.D.", "Rep.", "Rev.", + "S/A", "Sen.", "Sr.", + "sr.", "Sra.", + "sra.", "vs.", "tel.", "pág.",