spaCy/spacy/lang/it/tokenizer_exceptions.py
adrianeboyd 1a944e5976
Improve Italian tokenization (#5204)
Improve Italian tokenization for UD_Italian-ISDT.
2020-03-25 11:28:02 +01:00

58 lines
1.0 KiB
Python

# coding: utf8
from __future__ import unicode_literals
from ...symbols import ORTH, LEMMA
# Italian tokenizer exceptions: exact surface strings that must bypass the
# general punctuation/affix splitting rules. Multi-token entries split at
# the apostrophe; single-token entries protect abbreviations, hyphenated
# compounds, and similar forms from being broken apart.
_exc = {
    # Elided article/preposition + the abbreviation "art." ("articolo"):
    # split into exactly two tokens at the apostrophe.
    "all'art.": [{ORTH: "all'"}, {ORTH: "art."}],
    "dall'art.": [{ORTH: "dall'"}, {ORTH: "art."}],
    "dell'art.": [{ORTH: "dell'"}, {ORTH: "art."}],
    "L'art.": [{ORTH: "L'"}, {ORTH: "art."}],
    "l'art.": [{ORTH: "l'"}, {ORTH: "art."}],
    "nell'art.": [{ORTH: "nell'"}, {ORTH: "art."}],
    # "po'" (truncated "poco") stays a single token and gets its lemma.
    "po'": [{ORTH: "po'", LEMMA: "poco"}],
    # "sett.." = the abbreviation "sett." followed by a sentence-final period.
    "sett..": [{ORTH: "sett."}, {ORTH: "."}],
}

# Strings that must remain a single token, each mapped to a one-token
# exception of itself.
for orth in [
    "..",
    "....",
    "al.",
    "all-path",
    "art.",
    "Art.",
    "artt.",
    "att.",
    "by-pass",
    "c.d.",
    "centro-sinistra",
    "check-up",
    "Civ.",
    "cm.",
    "Cod.",
    "col.",
    "Cost.",
    "d.C.",
    # BUG FIX: the comma after this entry was missing, so implicit string
    # concatenation silently merged it with the next line into the bogus
    # single exception 'de"distr.' — and the "distr." exception was lost.
    'de"',
    "distr.",
    "E'",
    "ecc.",
    "e-mail",
    "e/o",
    "etc.",
    "Jr.",
    # BUG FIX: this entry was an empty string (non-ASCII character lost in
    # transcription); an empty-string exception is meaningless. Restored to
    # "n°" per the upstream spaCy exception list — TODO confirm.
    "n°",
    "nord-est",
    "pag.",
    "Proc.",
    "prof.",
    "sett.",
    "s.p.a.",
    "ss.",
    "St.",
    "tel.",
    "week-end",
]:
    _exc[orth] = [{ORTH: orth}]

# Public mapping consumed by spaCy's tokenizer machinery.
TOKENIZER_EXCEPTIONS = _exc