Improve Catalan tokenization accuracy (#3225)
* Small hyphen clean-up for French
* Catalan infix rule similar to French
This commit is contained in:
parent e00680a33a
commit a3efa3e8d9
spacy/lang/ca/__init__.py

@@ -12,6 +12,8 @@ from ...language import Language
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups
 
+from .punctuation import TOKENIZER_INFIXES
+
 
 class CatalanDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
@@ -23,6 +25,7 @@ class CatalanDefaults(Language.Defaults):
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
     lemma_lookup = LOOKUP
+    infixes = TOKENIZER_INFIXES
 
 
 class Catalan(Language):
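With the new infixes wired into CatalanDefaults, the tokenizer splits elided
articles and pronouns from the word that follows. A minimal sketch of the
expected behaviour (assuming a spaCy 2.x install that includes this change):

    from spacy.lang.ca import Catalan

    nlp = Catalan()
    doc = nlp("d'un")
    print([t.text for t in doc])  # expected: ["d'", "un"]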
spacy/lang/ca/punctuation.py (new file)

@@ -0,0 +1,15 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ..punctuation import TOKENIZER_INFIXES
+from ..char_classes import ALPHA
+
+
+ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")
+
+
+_infixes = TOKENIZER_INFIXES + [
+    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
+]
+
+TOKENIZER_INFIXES = _infixes
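The added infix is a zero-width pattern: a lookbehind for a letter followed by
an elision mark, and a lookahead for another letter, so the tokenizer cuts
between the apostrophe and the next word without consuming any characters.
A standalone sketch of the pattern (ALPHA here is a simplified stand-in for
spaCy's Unicode-aware character class):

    import re

    ALPHA = "a-zA-Z"  # assumption: ASCII letters suffice for this demo
    ELISION = "'’"    # straight and typographic apostrophes

    infix = r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
    m = re.search(infix, "d'un")
    cut = m.start()                      # zero-width match at index 2
    print(["d'un"[:cut], "d'un"[cut:]])  # ["d'", 'un']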
spacy/lang/fr/punctuation.py

@@ -317,8 +317,6 @@ _hyphen_prefix = [
     "Vaux",
 ]
 
-_other_hyphens = "".join([h for h in HYPHENS if h != "-"])
-
 _regular_exp = [
     "^a[{hyphen}]sexualis[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
     "^arginine[{hyphen}]méthyl[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
@@ -375,10 +373,9 @@ _regular_exp = [
 ]
 # catching cases like faux-vampire
 _regular_exp += [
-    "^{prefix}[{hyphen}][{al}][{al}{elision}{other_hyphen}\-]*$".format(
+    "^{prefix}[{hyphen}][{al}][{hyphen}{al}{elision}]*$".format(
         prefix=p,
-        hyphen=HYPHENS,
-        other_hyphen=_other_hyphens,
+        hyphen=HYPHENS,  # putting the - first in the [] range avoids having to use a backslash
         elision=ELISION,
         al=ALPHA_LOWER,
     )
@@ -388,8 +385,8 @@ _regular_exp += [
 # catching cases like entr'abat
 _elision_prefix = ["r?é?entr", "grande?s?", "r"]
 _regular_exp += [
-    "^{prefix}[{elision}][{al}][{al}{elision}{hyphen}\-]*$".format(
-        prefix=p, elision=ELISION, hyphen=_other_hyphens, al=ALPHA_LOWER
+    "^{prefix}[{elision}][{al}][{hyphen}{al}{elision}]*$".format(
+        prefix=p, elision=ELISION, hyphen=HYPHENS, al=ALPHA_LOWER
     )
     for p in _elision_prefix
 ]
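The clean-up relies on a regex detail: inside a [...] character class, a
literal "-" only needs escaping when it sits between two other characters.
Interpolating HYPHENS with "-" in first position makes both the "\-" escape
and the separate _other_hyphens set unnecessary. A standalone sketch (this
HYPHENS value is a simplified stand-in for spaCy's char_classes constant):

    import re

    HYPHENS = "-–—"  # "-" first, so it stays literal inside a character class

    # Mirrors the shape of the faux-vampire rule above.
    pattern = "^faux[{hyphen}][{hyphen}a-z]*$".format(hyphen=HYPHENS)
    print(bool(re.match(pattern, "faux-vampire")))  # True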
13
spacy/tests/lang/ca/test_prefix_suffix_infix.py
Normal file
13
spacy/tests/lang/ca/test_prefix_suffix_infix.py
Normal file
|
@ -0,0 +1,13 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text,expected_tokens", [("d'un", ["d'", "un"]), ("s'ha", ["s'", "ha"])]
|
||||
)
|
||||
def test_contractions(ca_tokenizer, text, expected_tokens):
|
||||
""" Test that the contractions are split into two tokens"""
|
||||
tokens = ca_tokenizer(text)
|
||||
assert len(tokens) == 2
|
|
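The ca_tokenizer fixture is provided by the shared test conftest rather than
this file. A minimal stand-in (an assumption for illustration, not the repo's
exact fixture) would be:

    import pytest
    from spacy.lang.ca import Catalan


    @pytest.fixture(scope="session")
    def ca_tokenizer():
        # Build a bare Catalan pipeline and reuse its tokenizer.
        return Catalan().tokenizer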
spacy/tests/lang/ca/test_text.py

@@ -18,7 +18,7 @@ def test_ca_tokenizer_handles_long_text(ca_tokenizer):
     una gerra de cervesa. Ens asseiem -fotògraf i periodista- en una terrassa buida."""
 
     tokens = ca_tokenizer(text)
-    assert len(tokens) == 136
+    assert len(tokens) == 138
 
 
 @pytest.mark.parametrize('text,length', [
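The expected count rises from 136 to 138 because the long sample text contains
two occurrences of elided forms that the new infix rule now splits, adding one
extra token for each.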