Improve Italian tokenization (#5204)

Improve Italian tokenization for UD_Italian-ISDT.
adrianeboyd 2020-03-25 11:28:02 +01:00 committed by GitHub
parent 923a453449
commit 1a944e5976
3 changed files with 83 additions and 8 deletions

spacy/lang/it/__init__.py

@@ -4,7 +4,7 @@ from __future__ import unicode_literals
 from .stop_words import STOP_WORDS
 from .tag_map import TAG_MAP
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_INFIXES
+from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
@@ -22,6 +22,7 @@ class ItalianDefaults(Language.Defaults):
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
     tag_map = TAG_MAP
+    prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
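
With prefixes now registered on ItalianDefaults, every Italian pipeline picks up the language-specific prefix rules when its tokenizer is built. A minimal sketch of the effect (the sample phrase is illustrative; assumes a spaCy v2.x install that includes this commit):

from spacy.lang.it import Italian

nlp = Italian()
# the new prefix rule r"'[0-9][0-9]" keeps apostrophe-year forms as one token
print([t.text for t in nlp("gli anni '90")])
# expected: ['gli', 'anni', "'90"]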

spacy/lang/it/punctuation.py

@@ -1,15 +1,39 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ..punctuation import TOKENIZER_INFIXES
-from ..char_classes import ALPHA
+from ..punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
+from ..char_classes import LIST_ELLIPSES, LIST_ICONS
+from ..char_classes import ALPHA, HYPHENS, CONCAT_QUOTES
+from ..char_classes import ALPHA_LOWER, ALPHA_UPPER

-ELISION = " ' ’ ".strip().replace(" ", "")
+ELISION = "'’"

-_infixes = TOKENIZER_INFIXES + [
-    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
-]
+_prefixes = (
+    [
+        r"'[0-9][0-9]",
+        r"[0-9]+°",
+    ]
+    + TOKENIZER_PREFIXES
+)
+
+_infixes = (
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
+        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
+        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
+            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
+        ),
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])(?:{h})(?=[{al}])".format(a=ALPHA, h=HYPHENS, al=ALPHA_LOWER),
+        r"(?<=[{a}0-9])[:<>=\/](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}][{el}])(?=[{a}0-9\"])".format(a=ALPHA, el=ELISION),
+    ]
+)
+
+TOKENIZER_PREFIXES = _prefixes
 TOKENIZER_INFIXES = _infixes
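
The rewritten rules add two Italian-specific prefixes (apostrophe-year forms and numeric ordinals with °) and extend the shared infix set with an elision split that now also fires before digits and double quotes. A rough sketch of the resulting behaviour (illustrative phrases; assumes a spaCy v2.x install that includes this commit):

from spacy.lang.it import Italian

nlp = Italian()
# the elision infix splits immediately after the apostrophe
print([t.text for t in nlp("Sull'isola c'è un'atmosfera serena.")])
# expected: ["Sull'", 'isola', "c'", 'è', "un'", 'atmosfera', 'serena', '.']

# the prefix r"[0-9]+°" keeps ordinals such as 5° together as one token
print([t.text for t in nlp("il 5° posto")])
# expected: ['il', '5°', 'posto']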

spacy/lang/it/tokenizer_exceptions.py

@@ -2,6 +2,56 @@
 from __future__ import unicode_literals

 from ...symbols import ORTH, LEMMA

-_exc = {"po'": [{ORTH: "po'", LEMMA: "poco"}]}
+_exc = {
+    "all'art.": [{ORTH: "all'"}, {ORTH: "art."}],
+    "dall'art.": [{ORTH: "dall'"}, {ORTH: "art."}],
+    "dell'art.": [{ORTH: "dell'"}, {ORTH: "art."}],
+    "L'art.": [{ORTH: "L'"}, {ORTH: "art."}],
+    "l'art.": [{ORTH: "l'"}, {ORTH: "art."}],
+    "nell'art.": [{ORTH: "nell'"}, {ORTH: "art."}],
+    "po'": [{ORTH: "po'", LEMMA: "poco"}],
+    "sett..": [{ORTH: "sett."}, {ORTH: "."}],
+}
+
+for orth in [
+    "..",
+    "....",
+    "al.",
+    "all-path",
+    "art.",
+    "Art.",
+    "artt.",
+    "att.",
+    "by-pass",
+    "c.d.",
+    "centro-sinistra",
+    "check-up",
+    "Civ.",
+    "cm.",
+    "Cod.",
+    "col.",
+    "Cost.",
+    "d.C.",
+    'de"',
+    "distr.",
+    "E'",
+    "ecc.",
+    "e-mail",
+    "e/o",
+    "etc.",
+    "Jr.",
"",
"nord-est",
"pag.",
"Proc.",
"prof.",
"sett.",
"s.p.a.",
"ss.",
"St.",
"tel.",
"week-end",
]:
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = _exc
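
These special cases take precedence over the elision infix above: a string like "dell'art." becomes exactly two tokens instead of three, and "po'" stays whole with its lemma set to "poco". An illustrative sketch (the sample phrases are not from the commit; assumes a spaCy v2.x install that includes this commit):

from spacy.lang.it import Italian

nlp = Italian()
# "dell'art." matches a special case: the elision splits, "art." stays whole
print([t.text for t in nlp("ai sensi dell'art. 13")])
# expected: ['ai', 'sensi', "dell'", 'art.', '13']

doc = nlp("un po' di pane")
print([t.text for t in doc])  # ["un", "po'", "di", "pane"]
print(doc[1].lemma_)          # "poco", from the LEMMA attribute above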