Improve tokenization for UD Spanish AnCora (#5253)

adrianeboyd authored on 2020-04-06 13:18:23 +02:00; committed via GitHub
parent f4ef64a526
commit e8be15e9b7
3 changed files with 54 additions and 1 deletion

spacy/lang/es/__init__.py

@@ -6,6 +6,7 @@ from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
@@ -23,6 +24,8 @@ class SpanishDefaults(Language.Defaults):
)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tag_map = TAG_MAP
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
stop_words = STOP_WORDS
syntax_iterators = SYNTAX_ITERATORS

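For reference, spaCy compiles these Defaults attributes into single regexes when the tokenizer is built; a minimal sketch of that wiring with the public compile_infix_regex / compile_suffix_regex helpers (assumes a spaCy v2.x install that includes this commit):

from spacy.util import compile_infix_regex, compile_suffix_regex
from spacy.lang.es import Spanish

# Each list of pattern strings is joined into one compiled regex.
infix_re = compile_infix_regex(Spanish.Defaults.infixes)
suffix_re = compile_suffix_regex(Spanish.Defaults.suffixes)

# e.g. the infix pattern now matches the slash in "casa/coche"
print(infix_re.search("casa/coche"))
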
spacy/lang/es/punctuation.py (new file)

@@ -0,0 +1,48 @@
# coding: utf8
from __future__ import unicode_literals

from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES
from ..char_classes import LIST_ICONS, CURRENCY, LIST_UNITS, PUNCT
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
from ..char_classes import merge_chars
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES


# AnCora keeps "%" attached to a preceding number, so drop it from the
# unit characters that get split off as suffixes
_list_units = [u for u in LIST_UNITS if u != "%"]
_units = merge_chars(" ".join(_list_units))
# treat em/en dashes like quote characters in the rules below
_concat_quotes = CONCAT_QUOTES + "—–"
_suffixes = (
    # split trailing em/en dashes off the token
    ["—", "–"]
    + LIST_PUNCT
    + LIST_ELLIPSES
    + LIST_QUOTES
    + LIST_ICONS
    + [
        r"(?<=[0-9])\+",
        r"(?<=°[FfCcKk])\.",
        r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
        r"(?<=[0-9])(?:{u})".format(u=_units),
        r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format(
            al=ALPHA_LOWER, e=r"%²\-\+", q=_concat_quotes, p=PUNCT
        ),
        r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
    ]
)
_infixes = (
    LIST_ELLIPSES
    + LIST_ICONS
    + [
        # arithmetic operators between digits
        r"(?<=[0-9])[+\*^](?=[0-9-])",
        # period between letters or quote-like characters
        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
            al=ALPHA_LOWER, au=ALPHA_UPPER, q=_concat_quotes
        ),
        # comma between letters, i.e. a missing space after a comma
        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
        # :, <, >, =, / between letters/digits and letters
        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
    ]
)
TOKENIZER_SUFFIXES = _suffixes
TOKENIZER_INFIXES = _infixes
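
A quick end-to-end check of the new rules, as a sketch (assumes a blank Spanish pipeline built from this branch; results printed rather than asserted):

from spacy.lang.es import Spanish

nlp = Spanish()
for text in ["casa/coche", "uno,dos", "25%"]:
    # "/" between letters is a new infix split; "%" is no longer split
    # off a preceding number, matching UD Spanish AnCora
    print(text, "->", [t.text for t in nlp(text)])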

spacy/lang/es/tokenizer_exceptions.py

@@ -43,14 +43,16 @@ for orth in [
"Av.",
"Avda.",
"Cía.",
"EE.UU.",
"etc.",
"fig.",
"Gob.",
"Gral.",
"Ing.",
"J.C.",
"km/h",
"Lic.",
"m.n.",
"no.",
"núm.",
"P.D.",
"Prof.",
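
These exceptions should survive as single tokens even where the new punctuation rules would otherwise split them (e.g. the "/" infix in "km/h"), since the tokenizer matches special cases before applying suffix and infix rules. Another quick sketch under the same assumptions:

from spacy.lang.es import Spanish

nlp = Spanish()
for text in ["EE.UU.", "km/h", "núm. 2"]:
    print(text, "->", [t.text for t in nlp(text)])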