Modifications/updates to Portuguese tokenization (#5203)

Modifications to Portuguese tokenization for UD_Portuguese-Bosque. Instead of being split via tokenizer exceptions, contractions are now kept as merged tokens.
2026-03-06 21:01:34 +03:00 · 2020-03-25 11:27:53 +01:00 · 2020-03-25 11:27:53 +01:00 · 923a453449
commit 923a453449
parent 4117a5c705
1 changed files with 19 additions and 41 deletions
--- a/spacy/lang/pt/tokenizer_exceptions.py
+++ b/spacy/lang/pt/tokenizer_exceptions.py
@@ -4,69 +4,47 @@ from __future__ import unicode_literals
 from ...symbols import ORTH, NORM


-_exc = {
-    "às": [{ORTH: "à", NORM: "a"}, {ORTH: "s", NORM: "as"}],
-    "ao": [{ORTH: "a"}, {ORTH: "o"}],
-    "aos": [{ORTH: "a"}, {ORTH: "os"}],
-    "àquele": [{ORTH: "à", NORM: "a"}, {ORTH: "quele", NORM: "aquele"}],
-    "àquela": [{ORTH: "à", NORM: "a"}, {ORTH: "quela", NORM: "aquela"}],
-    "àqueles": [{ORTH: "à", NORM: "a"}, {ORTH: "queles", NORM: "aqueles"}],
-    "àquelas": [{ORTH: "à", NORM: "a"}, {ORTH: "quelas", NORM: "aquelas"}],
-    "àquilo": [{ORTH: "à", NORM: "a"}, {ORTH: "quilo", NORM: "aquilo"}],
-    "aonde": [{ORTH: "a"}, {ORTH: "onde"}],
-}
-
-
-# Contractions
-_per_pron = ["ele", "ela", "eles", "elas"]
-_dem_pron = [
-    "este",
-    "esta",
-    "estes",
-    "estas",
-    "isto",
-    "esse",
-    "essa",
-    "esses",
-    "essas",
-    "isso",
-    "aquele",
-    "aquela",
-    "aqueles",
-    "aquelas",
-    "aquilo",
-]
-_und_pron = ["outro", "outra", "outros", "outras"]
-_adv = ["aqui", "aí", "ali", "além"]
-
-
-for orth in _per_pron + _dem_pron + _und_pron + _adv:
-    _exc["d" + orth] = [{ORTH: "d", NORM: "de"}, {ORTH: orth}]
-
-for orth in _per_pron + _dem_pron + _und_pron:
-    _exc["n" + orth] = [{ORTH: "n", NORM: "em"}, {ORTH: orth}]
+_exc = {}


 for orth in [
    "Adm.",
+    "Art.",
+    "art.",
+    "Av.",
+    "av.",
+    "Cia.",
+    "dom.",
    "Dr.",
+    "dr.",
    "e.g.",
    "E.g.",
    "E.G.",
+    "e/ou",
+    "ed.",
+    "eng.",
+    "etc.",
+    "Fund.",
    "Gen.",
    "Gov.",
    "i.e.",
    "I.e.",
    "I.E.",
+    "Inc.",
    "Jr.",
+    "km/h",
    "Ltd.",
+    "Mr.",
    "p.m.",
    "Ph.D.",
    "Rep.",
    "Rev.",
+    "S/A",
    "Sen.",
    "Sr.",
+    "sr.",
    "Sra.",
+    "sra.",
    "vs.",
    "tel.",
    "pág.",