Improve Italian tokenization (#5204)

Improve Italian tokenization for UD_Italian-ISDT.
adrianeboyd 2020-03-25 11:28:02 +01:00 committed by GitHub
parent 923a453449
commit 1a944e5976
3 changed files with 83 additions and 8 deletions

spacy/lang/it/__init__.py

@@ -4,7 +4,7 @@ from __future__ import unicode_literals
 from .stop_words import STOP_WORDS
 from .tag_map import TAG_MAP
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_INFIXES
+from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
@@ -22,6 +22,7 @@ class ItalianDefaults(Language.Defaults):
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
     tag_map = TAG_MAP
+    prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
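
With prefixes now registered on ItalianDefaults, every Italian pipeline picks up the language-specific prefix rules when its tokenizer is built. A minimal sketch of the effect (the sample phrase is illustrative; assumes a spaCy v2.x install that includes this commit):

from spacy.lang.it import Italian

nlp = Italian()
# the new prefix rule r"'[0-9][0-9]" keeps apostrophe-year forms as one token
print([t.text for t in nlp("gli anni '90")])
# expected: ['gli', 'anni', "'90"]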

spacy/lang/it/punctuation.py

@@ -1,15 +1,39 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ..punctuation import TOKENIZER_INFIXES
-from ..char_classes import ALPHA
+from ..punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
+from ..char_classes import LIST_ELLIPSES, LIST_ICONS
+from ..char_classes import ALPHA, HYPHENS, CONCAT_QUOTES
+from ..char_classes import ALPHA_LOWER, ALPHA_UPPER

-ELISION = " ' ’ ".strip().replace(" ", "")
+ELISION = "'’"

-_infixes = TOKENIZER_INFIXES + [
-    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
-]
+_prefixes = (
+    [
+        r"'[0-9][0-9]",
+        r"[0-9]+°",
+    ]
+    + TOKENIZER_PREFIXES
+)
+
+_infixes = (
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
+        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
+        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
+            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
+        ),
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])(?:{h})(?=[{al}])".format(a=ALPHA, h=HYPHENS, al=ALPHA_LOWER),
+        r"(?<=[{a}0-9])[:<>=\/](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}][{el}])(?=[{a}0-9\"])".format(a=ALPHA, el=ELISION),
+    ]
+)
+
+TOKENIZER_PREFIXES = _prefixes
 TOKENIZER_INFIXES = _infixes
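
The rewritten rules add two Italian-specific prefixes (apostrophe-year forms and numeric ordinals with °) and extend the shared infix set with an elision split that now also fires before digits and double quotes. A rough sketch of the resulting behaviour (illustrative phrases; assumes a spaCy v2.x install that includes this commit):

from spacy.lang.it import Italian

nlp = Italian()
# the elision infix splits immediately after the apostrophe
print([t.text for t in nlp("Sull'isola c'è un'atmosfera serena.")])
# expected: ["Sull'", 'isola', "c'", 'è', "un'", 'atmosfera', 'serena', '.']

# the prefix r"[0-9]+°" keeps ordinals such as 5° together as one token
print([t.text for t in nlp("il 5° posto")])
# expected: ['il', '5°', 'posto']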

spacy/lang/it/tokenizer_exceptions.py

@@ -2,6 +2,56 @@
 from __future__ import unicode_literals

 from ...symbols import ORTH, LEMMA

-_exc = {"po'": [{ORTH: "po'", LEMMA: "poco"}]}
+_exc = {
+    "all'art.": [{ORTH: "all'"}, {ORTH: "art."}],
+    "dall'art.": [{ORTH: "dall'"}, {ORTH: "art."}],
+    "dell'art.": [{ORTH: "dell'"}, {ORTH: "art."}],
+    "L'art.": [{ORTH: "L'"}, {ORTH: "art."}],
+    "l'art.": [{ORTH: "l'"}, {ORTH: "art."}],
+    "nell'art.": [{ORTH: "nell'"}, {ORTH: "art."}],
+    "po'": [{ORTH: "po'", LEMMA: "poco"}],
+    "sett..": [{ORTH: "sett."}, {ORTH: "."}],
+}
+
+for orth in [
+    "..",
+    "....",
+    "al.",
+    "all-path",
+    "art.",
+    "Art.",
+    "artt.",
+    "att.",
+    "by-pass",
+    "c.d.",
+    "centro-sinistra",
+    "check-up",
+    "Civ.",
+    "cm.",
+    "Cod.",
+    "col.",
+    "Cost.",
+    "d.C.",
+    'de"',
+    "distr.",
+    "E'",
+    "ecc.",
+    "e-mail",
+    "e/o",
+    "etc.",
+    "Jr.",
"",
"nord-est",
"pag.",
"Proc.",
"prof.",
"sett.",
"s.p.a.",
"ss.",
"St.",
"tel.",
"week-end",
]:
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = _exc
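
These special cases take precedence over the elision infix above: a string like "dell'art." becomes exactly two tokens instead of three, and "po'" stays whole with its lemma set to "poco". An illustrative sketch (the sample phrases are not from the commit; assumes a spaCy v2.x install that includes this commit):

from spacy.lang.it import Italian

nlp = Italian()
# "dell'art." matches a special case: the elision splits, "art." stays whole
print([t.text for t in nlp("ai sensi dell'art. 13")])
# expected: ['ai', 'sensi', "dell'", 'art.', '13']

doc = nlp("un po' di pane")
print([t.text for t in doc])  # ["un", "po'", "di", "pane"]
print(doc[1].lemma_)          # "poco", from the LEMMA attribute above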