Update Catalan tokenizer (#9297)

* Update Makefile

For a more recent Python version

* Updated for bsc changes

New tokenization changes

* Update test_text.py

* Updating tests and requirements

* Changed failing test in test/lang/ca

* Update .gitignore

Deleted stashed changes line

* Back to Python 3.6 and remove transformer requirements

As per request

* Update test_exception.py

Change the test

* Update test_exception.py

Remove test print

* Update Makefile

For a more recent Python version

* Updated for bsc changes

New tokenization changes

* Updating tests and requirements

* Update requirements.txt

Removed spacy-transformers from requirements

* Update test_exception.py

Added final punctuation to ensure consistency

* Update Makefile

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Format

* Update test to check all tokens

Co-authored-by: cayorodriguez <crodriguezp@gmail.com>
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Adriane Boyd 2021-09-27 14:42:30 +02:00 committed by GitHub
parent 200121a035
commit fe5f5d6ac6
6 changed files with 62 additions and 8 deletions

spacy/lang/ca/__init__.py Normal file → Executable file

@@ -3,7 +3,7 @@ from typing import Optional, Callable
 from thinc.api import Model
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
@@ -15,6 +15,7 @@ class CatalanDefaults(Language.Defaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
+    prefixes = TOKENIZER_PREFIXES
     stop_words = STOP_WORDS
     lex_attr_getters = LEX_ATTRS
     syntax_iterators = SYNTAX_ITERATORS
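
A minimal usage sketch (not part of this commit) of what registering prefixes on CatalanDefaults buys; it assumes spaCy v3.x with this change installed:

import spacy

# spacy.blank("ca") builds its tokenizer from CatalanDefaults, so the prefix
# rules from punctuation.py are compiled into the tokenizer's prefix_search.
nlp = spacy.blank("ca")
print([t.text for t in nlp("-hola")])  # per the updated tests: ['-', 'hola']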

spacy/lang/ca/punctuation.py Normal file → Executable file

@@ -1,4 +1,5 @@
 from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
+from ..char_classes import LIST_CURRENCY
 from ..char_classes import CURRENCY
 from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
 from ..char_classes import merge_chars, _units
@@ -6,6 +7,14 @@ from ..char_classes import merge_chars, _units
 ELISION = " ' ".strip().replace(" ", "").replace("\n", "")

+_prefixes = (
+    ["§", "%", "=", "—", "–", "-", r"\+(?![0-9])"]
+    + LIST_PUNCT
+    + LIST_ELLIPSES
+    + LIST_QUOTES
+    + LIST_CURRENCY
+    + LIST_ICONS
+)
+
 _infixes = (
     LIST_ELLIPSES
@@ -18,6 +27,7 @@ _infixes = (
         r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}][{el}])(?=[{a}0-9])".format(a=ALPHA, el=ELISION),
+        r"('ls|'l|'ns|'t|'m|'n|-les|-la|-lo|-li|-los|-me|-nos|-te|-vos|-se|-hi|-ne|-ho)(?![A-Za-z])|(-l'|-m'|-t'|-n')",
     ]
 )
@@ -44,3 +54,4 @@ _suffixes = (
 TOKENIZER_INFIXES = _infixes
 TOKENIZER_SUFFIXES = _suffixes
+TOKENIZER_PREFIXES = _prefixes
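
A hedged sketch (not part of this diff) of the effect of the new clitic infix rule; it assumes spaCy v3.x with this change installed:

import spacy

nlp = spacy.blank("ca")
# The added infix pattern splits clitic pronouns off the verb.
for text in ["cantar-te", "Vull parlar-te'n demà al matí"]:
    print(text, [t.text for t in nlp(text)])
# Per the updated tests: "cantar-te" -> ['cantar', '-te'],
# and the second sentence tokenizes into 8 tokens.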

spacy/lang/ca/tokenizer_exceptions.py Normal file → Executable file

@@ -18,12 +18,21 @@ for exc_data in [
     {ORTH: "nov.", NORM: "novembre"},
     {ORTH: "dec.", NORM: "desembre"},
     {ORTH: "Dr.", NORM: "doctor"},
+    {ORTH: "Dra.", NORM: "doctora"},
     {ORTH: "Sr.", NORM: "senyor"},
     {ORTH: "Sra.", NORM: "senyora"},
     {ORTH: "Srta.", NORM: "senyoreta"},
     {ORTH: "núm", NORM: "número"},
     {ORTH: "St.", NORM: "sant"},
     {ORTH: "Sta.", NORM: "santa"},
+    {ORTH: "pl.", NORM: "plaça"},
+    {ORTH: "à."},
+    {ORTH: "è."},
+    {ORTH: "é."},
+    {ORTH: "í."},
+    {ORTH: "ò."},
+    {ORTH: "ó."},
+    {ORTH: "ú."},
     {ORTH: "'l"},
     {ORTH: "'ls"},
     {ORTH: "'m"},
@@ -34,6 +43,18 @@
 ]:
     _exc[exc_data[ORTH]] = [exc_data]

+_exc["del"] = [{ORTH: "d", NORM: "de"}, {ORTH: "el"}]
+_exc["dels"] = [{ORTH: "d", NORM: "de"}, {ORTH: "els"}]
+
+_exc["al"] = [{ORTH: "a"}, {ORTH: "l", NORM: "el"}]
+_exc["als"] = [{ORTH: "a"}, {ORTH: "ls", NORM: "els"}]
+
+_exc["pel"] = [{ORTH: "p", NORM: "per"}, {ORTH: "el"}]
+_exc["pels"] = [{ORTH: "p", NORM: "per"}, {ORTH: "els"}]
+
+_exc["holahola"] = [{ORTH: "holahola", NORM: "cocacola"}]
+
 # Times
 _exc["12m."] = [{ORTH: "12"}, {ORTH: "m.", NORM: "p.m."}]

spacy/tests/lang/ca/test_exception.py

@@ -11,7 +11,18 @@ def test_ca_tokenizer_handles_abbr(ca_tokenizer, text, lemma):

 def test_ca_tokenizer_handles_exc_in_text(ca_tokenizer):
-    text = "La Núria i el Pere han vingut aprox. a les 7 de la tarda."
-    tokens = ca_tokenizer(text)
-    assert len(tokens) == 15
-    assert tokens[7].text == "aprox."
+    text = "La Dra. Puig viu a la pl. dels Til·lers."
+    doc = ca_tokenizer(text)
+    assert [t.text for t in doc] == [
+        "La",
+        "Dra.",
+        "Puig",
+        "viu",
+        "a",
+        "la",
+        "pl.",
+        "d",
+        "els",
+        "Til·lers",
+        ".",
+    ]

spacy/tests/lang/ca/test_prefix_suffix_infix.py

@@ -2,7 +2,14 @@ import pytest


 @pytest.mark.parametrize(
-    "text,expected_tokens", [("d'un", ["d'", "un"]), ("s'ha", ["s'", "ha"])]
+    "text,expected_tokens",
+    [
+        ("d'un", ["d'", "un"]),
+        ("s'ha", ["s'", "ha"]),
+        ("del", ["d", "el"]),
+        ("cantar-te", ["cantar", "-te"]),
+        ("-hola", ["-", "hola"]),
+    ],
 )
 def test_contractions(ca_tokenizer, text, expected_tokens):
     """Test that the contractions are split into two tokens"""

spacy/tests/lang/ca/test_text.py

@@ -12,17 +12,20 @@ def test_ca_tokenizer_handles_long_text(ca_tokenizer):
 una gerra de cervesa. Ens asseiem -fotògraf i periodista- en una terrassa buida."""

     tokens = ca_tokenizer(text)
-    assert len(tokens) == 140
+    assert len(tokens) == 146


 @pytest.mark.parametrize(
     "text,length",
     [
-        ("Perquè va anar-hi?", 4),
+        ("Perquè va anar-hi?", 5),
+        ("El cotxe dels veins.", 6),
         ("“Ah no?”", 5),
         ("""Sí! "Anem", va contestar el Joan Carles""", 11),
         ("Van córrer aprox. 10km", 5),
         ("Llavors perqué...", 3),
+        ("Vull parlar-te'n demà al matí", 8),
+        ("Vull explicar-t'ho demà al matí", 8),
     ],
 )
 def test_ca_tokenizer_handles_cnts(ca_tokenizer, text, length):