From 148b036e0cae9eebb6968cea5ecede1ebc7205a0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Samuel=20Rodr=C3=ADguez=20Medina?=
Date: Thu, 30 Apr 2020 11:13:23 +0200
Subject: [PATCH] Spanish like num improvement (#5381)

* Add tests for Spanish like_num.

* Add missing numbers in Spanish lexical attributes for like_num.

* Modify Spanish test function name.

* Add contributor agreement.
---
 spacy/lang/es/lex_attrs.py       |  9 +++++++++
 spacy/tests/lang/es/test_text.py | 30 ++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+)

diff --git a/spacy/lang/es/lex_attrs.py b/spacy/lang/es/lex_attrs.py
index 03ada1f43..632a638fc 100644
--- a/spacy/lang/es/lex_attrs.py
+++ b/spacy/lang/es/lex_attrs.py
@@ -26,6 +26,15 @@ _num_words = [
     "dieciocho",
     "diecinueve",
     "veinte",
+    "veintiuno",
+    "veintidós",
+    "veintitrés",
+    "veinticuatro",
+    "veinticinco",
+    "veintiséis",
+    "veintisiete",
+    "veintiocho",
+    "veintinueve",
     "treinta",
     "cuarenta",
     "cincuenta",
diff --git a/spacy/tests/lang/es/test_text.py b/spacy/tests/lang/es/test_text.py
index acd572b48..e237f922d 100644
--- a/spacy/tests/lang/es/test_text.py
+++ b/spacy/tests/lang/es/test_text.py
@@ -2,6 +2,7 @@
 from __future__ import unicode_literals
 
 import pytest
+from spacy.lang.es.lex_attrs import like_num
 
 
 def test_es_tokenizer_handles_long_text(es_tokenizer):
@@ -33,3 +34,32 @@ en Montevideo y que pregona las bondades de la vida austera."""
 def test_es_tokenizer_handles_cnts(es_tokenizer, text, length):
     tokens = es_tokenizer(text)
     assert len(tokens) == length
+
+
+@pytest.mark.parametrize(
+    "text,match",
+    [
+        ("10", True),
+        ("1", True),
+        ("10.000", True),
+        ("1000", True),
+        ("999,0", True),
+        ("uno", True),
+        ("dos", True),
+        ("billón", True),
+        ("veintiséis", True),
+        ("perro", False),
+        (",", False),
+        ("1/2", True),
+    ],
+)
+def test_lex_attrs_like_number(es_tokenizer, text, match):
+    tokens = es_tokenizer(text)
+    assert len(tokens) == 1
+    assert tokens[0].like_num == match
+
+
+@pytest.mark.parametrize("word", ["once"])
+def test_es_lex_attrs_capitals(word):
+    assert like_num(word)
+    assert like_num(word.upper())
\ No newline at end of file