mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 09:26:27 +03:00
Modifications/updates to Portuguese tokenization (#5203)
Modifications to Portuguese tokenization for UD_Portuguese-Bosque. Instead of being split via tokenizer exceptions, contractions are now kept as single merged tokens.
This commit is contained in:
parent
4117a5c705
commit
923a453449
|
@@ -4,69 +4,47 @@ from __future__ import unicode_literals
|
|||
from ...symbols import ORTH, NORM
|
||||
|
||||
|
||||
_exc = {
|
||||
"às": [{ORTH: "à", NORM: "a"}, {ORTH: "s", NORM: "as"}],
|
||||
"ao": [{ORTH: "a"}, {ORTH: "o"}],
|
||||
"aos": [{ORTH: "a"}, {ORTH: "os"}],
|
||||
"àquele": [{ORTH: "à", NORM: "a"}, {ORTH: "quele", NORM: "aquele"}],
|
||||
"àquela": [{ORTH: "à", NORM: "a"}, {ORTH: "quela", NORM: "aquela"}],
|
||||
"àqueles": [{ORTH: "à", NORM: "a"}, {ORTH: "queles", NORM: "aqueles"}],
|
||||
"àquelas": [{ORTH: "à", NORM: "a"}, {ORTH: "quelas", NORM: "aquelas"}],
|
||||
"àquilo": [{ORTH: "à", NORM: "a"}, {ORTH: "quilo", NORM: "aquilo"}],
|
||||
"aonde": [{ORTH: "a"}, {ORTH: "onde"}],
|
||||
}
|
||||
|
||||
|
||||
# Contractions
|
||||
_per_pron = ["ele", "ela", "eles", "elas"]
|
||||
_dem_pron = [
|
||||
"este",
|
||||
"esta",
|
||||
"estes",
|
||||
"estas",
|
||||
"isto",
|
||||
"esse",
|
||||
"essa",
|
||||
"esses",
|
||||
"essas",
|
||||
"isso",
|
||||
"aquele",
|
||||
"aquela",
|
||||
"aqueles",
|
||||
"aquelas",
|
||||
"aquilo",
|
||||
]
|
||||
_und_pron = ["outro", "outra", "outros", "outras"]
|
||||
_adv = ["aqui", "aí", "ali", "além"]
|
||||
|
||||
|
||||
for orth in _per_pron + _dem_pron + _und_pron + _adv:
|
||||
_exc["d" + orth] = [{ORTH: "d", NORM: "de"}, {ORTH: orth}]
|
||||
|
||||
for orth in _per_pron + _dem_pron + _und_pron:
|
||||
_exc["n" + orth] = [{ORTH: "n", NORM: "em"}, {ORTH: orth}]
|
||||
_exc = {}
|
||||
|
||||
|
||||
for orth in [
|
||||
"Adm.",
|
||||
"Art.",
|
||||
"art.",
|
||||
"Av.",
|
||||
"av.",
|
||||
"Cia.",
|
||||
"dom.",
|
||||
"Dr.",
|
||||
"dr.",
|
||||
"e.g.",
|
||||
"E.g.",
|
||||
"E.G.",
|
||||
"e/ou",
|
||||
"ed.",
|
||||
"eng.",
|
||||
"etc.",
|
||||
"Fund.",
|
||||
"Gen.",
|
||||
"Gov.",
|
||||
"i.e.",
|
||||
"I.e.",
|
||||
"I.E.",
|
||||
"Inc.",
|
||||
"Jr.",
|
||||
"km/h",
|
||||
"Ltd.",
|
||||
"Mr.",
|
||||
"p.m.",
|
||||
"Ph.D.",
|
||||
"Rep.",
|
||||
"Rev.",
|
||||
"S/A",
|
||||
"Sen.",
|
||||
"Sr.",
|
||||
"sr.",
|
||||
"Sra.",
|
||||
"sra.",
|
||||
"vs.",
|
||||
"tel.",
|
||||
"pág.",
|
||||
|
|
Loading…
Reference in New Issue
Block a user