From fe5f5d6ac66c4e9dafd7c14209039f02082e1b3c Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Mon, 27 Sep 2021 14:42:30 +0200
Subject: [PATCH] Update Catalan tokenizer (#9297)

* Update Makefile
For more recent python version

* updated for bsc changes
New tokenization changes

* Update test_text.py

* updating tests and requirements

* changed failed test in test/lang/ca
changed failed test in test/lang/ca

* Update .gitignore
deleted stashed changes line

* back to python 3.6 and remove transformer requirements
As per request

* Update test_exception.py
Change the test

* Update test_exception.py
Remove test print

* Update Makefile
For more recent python version

* updated for bsc changes
New tokenization changes

* updating tests and requirements

* Update requirements.txt
Removed spacy-transfromers from requirements

* Update test_exception.py
Added final punctuation to ensure consistency

* Update Makefile
Co-authored-by: Sofie Van Landeghem

* Format

* Update test to check all tokens

Co-authored-by: cayorodriguez
Co-authored-by: Sofie Van Landeghem
---
 spacy/lang/ca/__init__.py                     |  3 ++-
 spacy/lang/ca/punctuation.py                  | 11 ++++++++++
 spacy/lang/ca/tokenizer_exceptions.py         | 21 +++++++++++++++++++
 spacy/tests/lang/ca/test_exception.py         | 19 +++++++++++++----
 .../tests/lang/ca/test_prefix_suffix_infix.py |  9 +++++++-
 spacy/tests/lang/ca/test_text.py              |  7 +++++--
 6 files changed, 62 insertions(+), 8 deletions(-)
 mode change 100644 => 100755 spacy/lang/ca/__init__.py
 mode change 100644 => 100755 spacy/lang/ca/punctuation.py
 mode change 100644 => 100755 spacy/lang/ca/tokenizer_exceptions.py

diff --git a/spacy/lang/ca/__init__.py b/spacy/lang/ca/__init__.py
old mode 100644
new mode 100755
index 15d395c12..802c7e4cc
--- a/spacy/lang/ca/__init__.py
+++ b/spacy/lang/ca/__init__.py
@@ -3,7 +3,7 @@ from typing import Optional, Callable
 from thinc.api import Model
 
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
@@ -15,6 +15,7 @@ class CatalanDefaults(Language.Defaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
+    prefixes = TOKENIZER_PREFIXES
     stop_words = STOP_WORDS
     lex_attr_getters = LEX_ATTRS
     syntax_iterators = SYNTAX_ITERATORS
diff --git a/spacy/lang/ca/punctuation.py b/spacy/lang/ca/punctuation.py
old mode 100644
new mode 100755
index 39db08f17..8e2f09828
--- a/spacy/lang/ca/punctuation.py
+++ b/spacy/lang/ca/punctuation.py
@@ -1,4 +1,5 @@
 from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
+from ..char_classes import LIST_CURRENCY
 from ..char_classes import CURRENCY
 from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
 from ..char_classes import merge_chars, _units
@@ -6,6 +7,14 @@ from ..char_classes import merge_chars, _units
 
 ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")
 
+_prefixes = (
+    ["§", "%", "=", "—", "–", "-", r"\+(?![0-9])"]
+    + LIST_PUNCT
+    + LIST_ELLIPSES
+    + LIST_QUOTES
+    + LIST_CURRENCY
+    + LIST_ICONS
+)
 
 _infixes = (
     LIST_ELLIPSES
@@ -18,6 +27,7 @@ _infixes = (
         r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}][{el}])(?=[{a}0-9])".format(a=ALPHA, el=ELISION),
+        r"('ls|'l|'ns|'t|'m|'n|-les|-la|-lo|-li|-los|-me|-nos|-te|-vos|-se|-hi|-ne|-ho)(?![A-Za-z])|(-l'|-m'|-t'|-n')",
     ]
 )
 
@@ -44,3 +54,4 @@ _suffixes = (
 
 TOKENIZER_INFIXES = _infixes
 TOKENIZER_SUFFIXES = _suffixes
+TOKENIZER_PREFIXES = _prefixes
diff --git a/spacy/lang/ca/tokenizer_exceptions.py b/spacy/lang/ca/tokenizer_exceptions.py
old mode 100644
new mode 100755
index 5f9a50f5e..b261b3498
--- a/spacy/lang/ca/tokenizer_exceptions.py
+++ b/spacy/lang/ca/tokenizer_exceptions.py
@@ -18,12 +18,21 @@ for exc_data in [
     {ORTH: "nov.", NORM: "novembre"},
     {ORTH: "dec.", NORM: "desembre"},
     {ORTH: "Dr.", NORM: "doctor"},
+    {ORTH: "Dra.", NORM: "doctora"},
     {ORTH: "Sr.", NORM: "senyor"},
     {ORTH: "Sra.", NORM: "senyora"},
     {ORTH: "Srta.", NORM: "senyoreta"},
     {ORTH: "núm", NORM: "número"},
     {ORTH: "St.", NORM: "sant"},
     {ORTH: "Sta.", NORM: "santa"},
+    {ORTH: "pl.", NORM: "plaça"},
+    {ORTH: "à."},
+    {ORTH: "è."},
+    {ORTH: "é."},
+    {ORTH: "í."},
+    {ORTH: "ò."},
+    {ORTH: "ó."},
+    {ORTH: "ú."},
     {ORTH: "'l"},
     {ORTH: "'ls"},
     {ORTH: "'m"},
@@ -34,6 +43,18 @@ for exc_data in [
 ]:
     _exc[exc_data[ORTH]] = [exc_data]
 
+_exc["del"] = [{ORTH: "d", NORM: "de"}, {ORTH: "el"}]
+_exc["dels"] = [{ORTH: "d", NORM: "de"}, {ORTH: "els"}]
+
+_exc["al"] = [{ORTH: "a"}, {ORTH: "l", NORM: "el"}]
+_exc["als"] = [{ORTH: "a"}, {ORTH: "ls", NORM: "els"}]
+
+_exc["pel"] = [{ORTH: "p", NORM: "per"}, {ORTH: "el"}]
+_exc["pels"] = [{ORTH: "p", NORM: "per"}, {ORTH: "els"}]
+
+_exc["holahola"] = [{ORTH: "holahola", NORM: "cocacola"}]
+
+
 # Times
 _exc["12m."] = [{ORTH: "12"}, {ORTH: "m.", NORM: "p.m."}]
 
diff --git a/spacy/tests/lang/ca/test_exception.py b/spacy/tests/lang/ca/test_exception.py
index cfb574b63..499027ab1 100644
--- a/spacy/tests/lang/ca/test_exception.py
+++ b/spacy/tests/lang/ca/test_exception.py
@@ -11,7 +11,18 @@ def test_ca_tokenizer_handles_abbr(ca_tokenizer, text, lemma):
 
 
 def test_ca_tokenizer_handles_exc_in_text(ca_tokenizer):
-    text = "La Núria i el Pere han vingut aprox. a les 7 de la tarda."
-    tokens = ca_tokenizer(text)
-    assert len(tokens) == 15
-    assert tokens[7].text == "aprox."
+    text = "La Dra. Puig viu a la pl. dels Til·lers."
+    doc = ca_tokenizer(text)
+    assert [t.text for t in doc] == [
+        "La",
+        "Dra.",
+        "Puig",
+        "viu",
+        "a",
+        "la",
+        "pl.",
+        "d",
+        "els",
+        "Til·lers",
+        ".",
+    ]
diff --git a/spacy/tests/lang/ca/test_prefix_suffix_infix.py b/spacy/tests/lang/ca/test_prefix_suffix_infix.py
index a3c76ab5b..afbdf3696 100644
--- a/spacy/tests/lang/ca/test_prefix_suffix_infix.py
+++ b/spacy/tests/lang/ca/test_prefix_suffix_infix.py
@@ -2,7 +2,14 @@ import pytest
 
 
 @pytest.mark.parametrize(
-    "text,expected_tokens", [("d'un", ["d'", "un"]), ("s'ha", ["s'", "ha"])]
+    "text,expected_tokens",
+    [
+        ("d'un", ["d'", "un"]),
+        ("s'ha", ["s'", "ha"]),
+        ("del", ["d", "el"]),
+        ("cantar-te", ["cantar", "-te"]),
+        ("-hola", ["-", "hola"]),
+    ],
 )
 def test_contractions(ca_tokenizer, text, expected_tokens):
     """Test that the contractions are split into two tokens"""
diff --git a/spacy/tests/lang/ca/test_text.py b/spacy/tests/lang/ca/test_text.py
index 55bad0e94..5db7af553 100644
--- a/spacy/tests/lang/ca/test_text.py
+++ b/spacy/tests/lang/ca/test_text.py
@@ -12,17 +12,20 @@ def test_ca_tokenizer_handles_long_text(ca_tokenizer):
 una gerra de cervesa.
 Ens asseiem -fotògraf i periodista- en una terrassa buida."""
     tokens = ca_tokenizer(text)
-    assert len(tokens) == 140
+    assert len(tokens) == 146
 
 
 @pytest.mark.parametrize(
     "text,length",
     [
-        ("Perquè va anar-hi?", 4),
+        ("Perquè va anar-hi?", 5),
+        ("El cotxe dels veins.", 6),
         ("“Ah no?”", 5),
         ("""Sí! "Anem", va contestar el Joan Carles""", 11),
         ("Van córrer aprox. 10km", 5),
         ("Llavors perqué...", 3),
+        ("Vull parlar-te'n demà al matí", 8),
+        ("Vull explicar-t'ho demà al matí", 8),
     ],
 )
 def test_ca_tokenizer_handles_cnts(ca_tokenizer, text, length):
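
For reference beyond the test suite, here is a minimal usage sketch (not part of the patch) that runs the updated Catalan tokenizer on examples covered by the new tests. It assumes a spaCy installation that already includes this change; the expected outputs in the comments are taken directly from the tests above.

import spacy

nlp = spacy.blank("ca")  # tokenizer-only pipeline built from the Catalan defaults

# Contractions handled by the new tokenizer_exceptions entries are split in two.
print([t.text for t in nlp("del")])        # ['d', 'el']

# Clitic pronouns are split off by the new infix rule in punctuation.py.
print([t.text for t in nlp("cantar-te")])  # ['cantar', '-te']

# New abbreviations such as "Dra." and "pl." remain single tokens.
print([t.text for t in nlp("La Dra. Puig viu a la pl. dels Til·lers.")])
# ['La', 'Dra.', 'Puig', 'viu', 'a', 'la', 'pl.', 'd', 'els', 'Til·lers', '.']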