Update Catalan tokenizer (#9297)

* Update Makefile

For a more recent Python version

* updated for bsc changes

New tokenization changes

* Update test_text.py

* updating tests and requirements

* changed failing test in test/lang/ca

changed failing test in test/lang/ca

* Update .gitignore

deleted stashed changes line

* back to Python 3.6 and remove transformer requirements

As per request

* Update test_exception.py

Change the test

* Update test_exception.py

Remove test print

* Update Makefile

For a more recent Python version

* updated for bsc changes

New tokenization changes

* updating tests and requirements

* Update requirements.txt

Removed spacy-transformers from requirements

* Update test_exception.py

Added final punctuation to ensure consistency

* Update Makefile

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Format

* Update test to check all tokens

Co-authored-by: cayorodriguez <crodriguezp@gmail.com>
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Adriane Boyd 2021-09-27 14:42:30 +02:00 committed by GitHub
parent 200121a035
commit fe5f5d6ac6
6 changed files with 62 additions and 8 deletions

spacy/lang/ca/__init__.py (3 changes; Normal file → Executable file)

@@ -3,7 +3,7 @@ from typing import Optional, Callable
from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
@@ -15,6 +15,7 @@ class CatalanDefaults(Language.Defaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
prefixes = TOKENIZER_PREFIXES
stop_words = STOP_WORDS
lex_attr_getters = LEX_ATTRS
syntax_iterators = SYNTAX_ITERATORS
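
Since CatalanDefaults now registers prefixes alongside the infix and suffix rules, any pipeline built from the Catalan language class picks the prefix patterns up automatically. A minimal sketch, assuming a spaCy install that includes this change; the expected split comes from the updated contraction test further down:

from spacy.lang.ca import Catalan

# Catalan() builds its tokenizer from CatalanDefaults, which now includes
# the TOKENIZER_PREFIXES defined in punctuation.py.
nlp = Catalan()

# A leading hyphen is stripped as a prefix, matching the new "-hola" test case.
assert [t.text for t in nlp("-hola")] == ["-", "hola"]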

spacy/lang/ca/punctuation.py (11 changes; Normal file → Executable file)

@@ -1,4 +1,5 @@
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
from ..char_classes import LIST_CURRENCY
from ..char_classes import CURRENCY
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
from ..char_classes import merge_chars, _units
@@ -6,6 +7,14 @@ from ..char_classes import merge_chars, _units
ELISION = " ' ".strip().replace(" ", "").replace("\n", "")
_prefixes = (
["§", "%", "=", "", "", "-", r"\+(?![0-9])"]
+ LIST_PUNCT
+ LIST_ELLIPSES
+ LIST_QUOTES
+ LIST_CURRENCY
+ LIST_ICONS
)
_infixes = (
LIST_ELLIPSES
@@ -18,6 +27,7 @@ _infixes = (
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}][{el}])(?=[{a}0-9])".format(a=ALPHA, el=ELISION),
r"('ls|'l|'ns|'t|'m|'n|-les|-la|-lo|-li|-los|-me|-nos|-te|-vos|-se|-hi|-ne|-ho)(?![A-Za-z])|(-l'|-m'|-t'|-n')",
]
)
@@ -44,3 +54,4 @@ _suffixes = (
TOKENIZER_INFIXES = _infixes
TOKENIZER_SUFFIXES = _suffixes
TOKENIZER_PREFIXES = _prefixes
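
The new _prefixes list and the extended infix pattern are what drive the clitic handling checked by the tests below. A minimal sketch of the resulting behaviour, assuming an environment with this branch installed; the expected values come straight from the updated tests:

import spacy

nlp = spacy.blank("ca")

# Enclitic pronouns such as -te are now split off by the clitic infix rule.
assert [t.text for t in nlp("cantar-te")] == ["cantar", "-te"]

# Chained clitics plus the "al" special case give 8 tokens for each of the
# new test_text.py sentences.
assert len(nlp("Vull parlar-te'n demà al matí")) == 8
assert len(nlp("Vull explicar-t'ho demà al matí")) == 8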

spacy/lang/ca/tokenizer_exceptions.py (21 changes; Normal file → Executable file)

@@ -18,12 +18,21 @@ for exc_data in [
{ORTH: "nov.", NORM: "novembre"},
{ORTH: "dec.", NORM: "desembre"},
{ORTH: "Dr.", NORM: "doctor"},
{ORTH: "Dra.", NORM: "doctora"},
{ORTH: "Sr.", NORM: "senyor"},
{ORTH: "Sra.", NORM: "senyora"},
{ORTH: "Srta.", NORM: "senyoreta"},
{ORTH: "núm", NORM: "número"},
{ORTH: "St.", NORM: "sant"},
{ORTH: "Sta.", NORM: "santa"},
{ORTH: "pl.", NORM: "plaça"},
{ORTH: "à."},
{ORTH: "è."},
{ORTH: "é."},
{ORTH: "í."},
{ORTH: "ò."},
{ORTH: "ó."},
{ORTH: "ú."},
{ORTH: "'l"},
{ORTH: "'ls"},
{ORTH: "'m"},
@@ -34,6 +43,18 @@ for exc_data in [
]:
_exc[exc_data[ORTH]] = [exc_data]
_exc["del"] = [{ORTH: "d", NORM: "de"}, {ORTH: "el"}]
_exc["dels"] = [{ORTH: "d", NORM: "de"}, {ORTH: "els"}]
_exc["al"] = [{ORTH: "a"}, {ORTH: "l", NORM: "el"}]
_exc["als"] = [{ORTH: "a"}, {ORTH: "ls", NORM: "els"}]
_exc["pel"] = [{ORTH: "p", NORM: "per"}, {ORTH: "el"}]
_exc["pels"] = [{ORTH: "p", NORM: "per"}, {ORTH: "els"}]
_exc["holahola"] = [{ORTH: "holahola", NORM: "cocacola"}]
# Times
_exc["12m."] = [{ORTH: "12"}, {ORTH: "m.", NORM: "p.m."}]

test_exception.py

@@ -11,7 +11,18 @@ def test_ca_tokenizer_handles_abbr(ca_tokenizer, text, lemma):
def test_ca_tokenizer_handles_exc_in_text(ca_tokenizer):
text = "La Núria i el Pere han vingut aprox. a les 7 de la tarda."
tokens = ca_tokenizer(text)
assert len(tokens) == 15
assert tokens[7].text == "aprox."
text = "La Dra. Puig viu a la pl. dels Til·lers."
doc = ca_tokenizer(text)
assert [t.text for t in doc] == [
"La",
"Dra.",
"Puig",
"viu",
"a",
"la",
"pl.",
"d",
"els",
"Til·lers",
".",
]


@@ -2,7 +2,14 @@ import pytest
@pytest.mark.parametrize(
"text,expected_tokens", [("d'un", ["d'", "un"]), ("s'ha", ["s'", "ha"])]
"text,expected_tokens",
[
("d'un", ["d'", "un"]),
("s'ha", ["s'", "ha"]),
("del", ["d", "el"]),
("cantar-te", ["cantar", "-te"]),
("-hola", ["-", "hola"]),
],
)
def test_contractions(ca_tokenizer, text, expected_tokens):
"""Test that the contractions are split into two tokens"""

test_text.py

@@ -12,17 +12,20 @@ def test_ca_tokenizer_handles_long_text(ca_tokenizer):
una gerra de cervesa. Ens asseiem -fotògraf i periodista- en una terrassa buida."""
tokens = ca_tokenizer(text)
assert len(tokens) == 140
assert len(tokens) == 146
@pytest.mark.parametrize(
"text,length",
[
("Perquè va anar-hi?", 4),
("Perquè va anar-hi?", 5),
("El cotxe dels veins.", 6),
("“Ah no?”", 5),
("""Sí! "Anem", va contestar el Joan Carles""", 11),
("Van córrer aprox. 10km", 5),
("Llavors perqué...", 3),
("Vull parlar-te'n demà al matí", 8),
("Vull explicar-t'ho demà al matí", 8),
],
)
def test_ca_tokenizer_handles_cnts(ca_tokenizer, text, length):
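
To reproduce these counts outside the test suite, the ca_tokenizer fixture can be approximated with a blank Catalan pipeline's tokenizer; the fixture wiring in this sketch is an assumption, and only the expected count comes from the diff:

import spacy

# Stand-in for the pytest ca_tokenizer fixture.
ca_tokenizer = spacy.blank("ca").tokenizer

# "anar-hi" is now split at the clitic, so the count goes from 4 to 5.
assert len(ca_tokenizer("Perquè va anar-hi?")) == 5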