From fe5f5d6ac66c4e9dafd7c14209039f02082e1b3c Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Mon, 27 Sep 2021 14:42:30 +0200
Subject: [PATCH] Update Catalan tokenizer (#9297)

* Update Makefile
For more recent python version

* updated for bsc changes
New tokenization changes

* Update test_text.py

* updating tests and requirements

* changed failed test in test/lang/ca
changed failed test in test/lang/ca

* Update .gitignore
deleted stashed changes line

* back to python 3.6 and remove transformer requirements
As per request

* Update test_exception.py
Change the test

* Update test_exception.py
Remove test print

* Update Makefile
For more recent python version

* updated for bsc changes
New tokenization changes

* updating tests and requirements

* Update requirements.txt
Removed spacy-transfromers from requirements

* Update test_exception.py
Added final punctuation to ensure consistency

* Update Makefile
Co-authored-by: Sofie Van Landeghem

* Format

* Update test to check all tokens

Co-authored-by: cayorodriguez
Co-authored-by: Sofie Van Landeghem
---
 spacy/lang/ca/__init__.py                     |  3 ++-
 spacy/lang/ca/punctuation.py                  | 11 ++++++++++
 spacy/lang/ca/tokenizer_exceptions.py         | 21 +++++++++++++++++++
 spacy/tests/lang/ca/test_exception.py         | 19 +++++++++++++----
 .../tests/lang/ca/test_prefix_suffix_infix.py |  9 +++++++-
 spacy/tests/lang/ca/test_text.py              |  7 +++++--
 6 files changed, 62 insertions(+), 8 deletions(-)
 mode change 100644 => 100755 spacy/lang/ca/__init__.py
 mode change 100644 => 100755 spacy/lang/ca/punctuation.py
 mode change 100644 => 100755 spacy/lang/ca/tokenizer_exceptions.py

diff --git a/spacy/lang/ca/__init__.py b/spacy/lang/ca/__init__.py
old mode 100644
new mode 100755
index 15d395c12..802c7e4cc
--- a/spacy/lang/ca/__init__.py
+++ b/spacy/lang/ca/__init__.py
@@ -3,7 +3,7 @@ from typing import Optional, Callable
 from thinc.api import Model
 
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
@@ -15,6 +15,7 @@ class CatalanDefaults(Language.Defaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
+    prefixes = TOKENIZER_PREFIXES
     stop_words = STOP_WORDS
     lex_attr_getters = LEX_ATTRS
     syntax_iterators = SYNTAX_ITERATORS
diff --git a/spacy/lang/ca/punctuation.py b/spacy/lang/ca/punctuation.py
old mode 100644
new mode 100755
index 39db08f17..8e2f09828
--- a/spacy/lang/ca/punctuation.py
+++ b/spacy/lang/ca/punctuation.py
@@ -1,4 +1,5 @@
 from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
+from ..char_classes import LIST_CURRENCY
 from ..char_classes import CURRENCY
 from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
 from ..char_classes import merge_chars, _units
@@ -6,6 +7,14 @@ from ..char_classes import merge_chars, _units
 
 ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")
 
+_prefixes = (
+    ["§", "%", "=", "—", "–", "-", r"\+(?![0-9])"]
+    + LIST_PUNCT
+    + LIST_ELLIPSES
+    + LIST_QUOTES
+    + LIST_CURRENCY
+    + LIST_ICONS
+)
 
 _infixes = (
     LIST_ELLIPSES
@@ -18,6 +27,7 @@ _infixes = (
         r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}][{el}])(?=[{a}0-9])".format(a=ALPHA, el=ELISION),
+        r"('ls|'l|'ns|'t|'m|'n|-les|-la|-lo|-li|-los|-me|-nos|-te|-vos|-se|-hi|-ne|-ho)(?![A-Za-z])|(-l'|-m'|-t'|-n')",
     ]
 )
 
@@ -44,3 +54,4 @@ _suffixes = (
 
 TOKENIZER_INFIXES = _infixes
 TOKENIZER_SUFFIXES = _suffixes
+TOKENIZER_PREFIXES = _prefixes
diff --git a/spacy/lang/ca/tokenizer_exceptions.py b/spacy/lang/ca/tokenizer_exceptions.py
old mode 100644
new mode 100755
index 5f9a50f5e..b261b3498
--- a/spacy/lang/ca/tokenizer_exceptions.py
+++ b/spacy/lang/ca/tokenizer_exceptions.py
@@ -18,12 +18,21 @@ for exc_data in [
     {ORTH: "nov.", NORM: "novembre"},
     {ORTH: "dec.", NORM: "desembre"},
     {ORTH: "Dr.", NORM: "doctor"},
+    {ORTH: "Dra.", NORM: "doctora"},
     {ORTH: "Sr.", NORM: "senyor"},
     {ORTH: "Sra.", NORM: "senyora"},
     {ORTH: "Srta.", NORM: "senyoreta"},
     {ORTH: "núm", NORM: "número"},
     {ORTH: "St.", NORM: "sant"},
     {ORTH: "Sta.", NORM: "santa"},
+    {ORTH: "pl.", NORM: "plaça"},
+    {ORTH: "à."},
+    {ORTH: "è."},
+    {ORTH: "é."},
+    {ORTH: "í."},
+    {ORTH: "ò."},
+    {ORTH: "ó."},
+    {ORTH: "ú."},
     {ORTH: "'l"},
     {ORTH: "'ls"},
     {ORTH: "'m"},
@@ -34,6 +43,18 @@ for exc_data in [
 ]:
     _exc[exc_data[ORTH]] = [exc_data]
 
+_exc["del"] = [{ORTH: "d", NORM: "de"}, {ORTH: "el"}]
+_exc["dels"] = [{ORTH: "d", NORM: "de"}, {ORTH: "els"}]
+
+_exc["al"] = [{ORTH: "a"}, {ORTH: "l", NORM: "el"}]
+_exc["als"] = [{ORTH: "a"}, {ORTH: "ls", NORM: "els"}]
+
+_exc["pel"] = [{ORTH: "p", NORM: "per"}, {ORTH: "el"}]
+_exc["pels"] = [{ORTH: "p", NORM: "per"}, {ORTH: "els"}]
+
+_exc["holahola"] = [{ORTH: "holahola", NORM: "cocacola"}]
+
+
 # Times
 _exc["12m."] = [{ORTH: "12"}, {ORTH: "m.", NORM: "p.m."}]
 
diff --git a/spacy/tests/lang/ca/test_exception.py b/spacy/tests/lang/ca/test_exception.py
index cfb574b63..499027ab1 100644
--- a/spacy/tests/lang/ca/test_exception.py
+++ b/spacy/tests/lang/ca/test_exception.py
@@ -11,7 +11,18 @@ def test_ca_tokenizer_handles_abbr(ca_tokenizer, text, lemma):
 
 
 def test_ca_tokenizer_handles_exc_in_text(ca_tokenizer):
-    text = "La Núria i el Pere han vingut aprox. a les 7 de la tarda."
-    tokens = ca_tokenizer(text)
-    assert len(tokens) == 15
-    assert tokens[7].text == "aprox."
+    text = "La Dra. Puig viu a la pl. dels Til·lers."
+    doc = ca_tokenizer(text)
+    assert [t.text for t in doc] == [
+        "La",
+        "Dra.",
+        "Puig",
+        "viu",
+        "a",
+        "la",
+        "pl.",
+        "d",
+        "els",
+        "Til·lers",
+        ".",
+    ]
diff --git a/spacy/tests/lang/ca/test_prefix_suffix_infix.py b/spacy/tests/lang/ca/test_prefix_suffix_infix.py
index a3c76ab5b..afbdf3696 100644
--- a/spacy/tests/lang/ca/test_prefix_suffix_infix.py
+++ b/spacy/tests/lang/ca/test_prefix_suffix_infix.py
@@ -2,7 +2,14 @@ import pytest
 
 
 @pytest.mark.parametrize(
-    "text,expected_tokens", [("d'un", ["d'", "un"]), ("s'ha", ["s'", "ha"])]
+    "text,expected_tokens",
+    [
+        ("d'un", ["d'", "un"]),
+        ("s'ha", ["s'", "ha"]),
+        ("del", ["d", "el"]),
+        ("cantar-te", ["cantar", "-te"]),
+        ("-hola", ["-", "hola"]),
+    ],
 )
 def test_contractions(ca_tokenizer, text, expected_tokens):
     """Test that the contractions are split into two tokens"""
diff --git a/spacy/tests/lang/ca/test_text.py b/spacy/tests/lang/ca/test_text.py
index 55bad0e94..5db7af553 100644
--- a/spacy/tests/lang/ca/test_text.py
+++ b/spacy/tests/lang/ca/test_text.py
@@ -12,17 +12,20 @@ def test_ca_tokenizer_handles_long_text(ca_tokenizer):
 una gerra de cervesa.
 Ens asseiem -fotògraf i periodista- en una terrassa buida."""
     tokens = ca_tokenizer(text)
-    assert len(tokens) == 140
+    assert len(tokens) == 146
 
 
 @pytest.mark.parametrize(
     "text,length",
     [
-        ("Perquè va anar-hi?", 4),
+        ("Perquè va anar-hi?", 5),
+        ("El cotxe dels veins.", 6),
         ("“Ah no?”", 5),
         ("""Sí! "Anem", va contestar el Joan Carles""", 11),
         ("Van córrer aprox. 10km", 5),
         ("Llavors perqué...", 3),
+        ("Vull parlar-te'n demà al matí", 8),
+        ("Vull explicar-t'ho demà al matí", 8),
     ],
 )
 def test_ca_tokenizer_handles_cnts(ca_tokenizer, text, length):
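
For reference beyond the test suite, here is a minimal usage sketch (not part of the patch) that runs the updated Catalan tokenizer on examples covered by the new tests. It assumes a spaCy installation that already includes this change; the expected outputs in the comments are taken directly from the tests above.

import spacy

nlp = spacy.blank("ca")  # tokenizer-only pipeline built from the Catalan defaults

# Contractions handled by the new tokenizer_exceptions entries are split in two.
print([t.text for t in nlp("del")])        # ['d', 'el']

# Clitic pronouns are split off by the new infix rule in punctuation.py.
print([t.text for t in nlp("cantar-te")])  # ['cantar', '-te']

# New abbreviations such as "Dra." and "pl." remain single tokens.
print([t.text for t in nlp("La Dra. Puig viu a la pl. dels Til·lers.")])
# ['La', 'Dra.', 'Puig', 'viu', 'a', 'la', 'pl.', 'd', 'els', 'Til·lers', '.']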