From 731b40cf51b97fe27cda4a07ec61dd6b88a5cf31 Mon Sep 17 00:00:00 2001 From: Matthew Alexander Hernandez Date: Sat, 29 Nov 2025 02:03:01 -0700 Subject: [PATCH] Enhance lex_attrs for Spanish & Portuguese --- .github/contributors/weezymatt.md | 106 ++++++++++++++++++++++++++++++ spacy/lang/es/lex_attrs.py | 40 ++++++++++- spacy/lang/pt/lex_attrs.py | 53 +++++++++++++-- spacy/tests/lang/es/test_text.py | 17 +++-- spacy/tests/lang/pt/test_text.py | 31 +++++++++ 5 files changed, 235 insertions(+), 12 deletions(-) create mode 100644 .github/contributors/weezymatt.md diff --git a/.github/contributors/weezymatt.md b/.github/contributors/weezymatt.md new file mode 100644 index 000000000..ca7091507 --- /dev/null +++ b/.github/contributors/weezymatt.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Matthew A. Hernandez | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 11/29/2025 | +| GitHub username | weezygeezer | +| Website (optional) | | diff --git a/spacy/lang/es/lex_attrs.py b/spacy/lang/es/lex_attrs.py index 4c477eaee..09f618c88 100644 --- a/spacy/lang/es/lex_attrs.py +++ b/spacy/lang/es/lex_attrs.py @@ -45,38 +45,71 @@ _num_words = [ "trillón", ] - +# Masculine, feminine, and apocopation forms _ordinal_words = [ + "primer", "primero", + "primera", "segundo", + "segunda", + "tercer", "tercero", + "tercera", "cuarto", + "cuarta", "quinto", + "quinta", "sexto", + "sexta", "séptimo", + "séptima", "octavo", + "octava", "noveno", + "novena", "décimo", + "décima", "undécimo", + "undécima", "duodécimo", + "duodécima", "decimotercero", + "decimotercera", "decimocuarto", + "decimocuarta", "decimoquinto", + "decimoquinta", "decimosexto", + "decimosexta", "decimoséptimo", + "decimoséptima", "decimoctavo", + "decimoctava", "decimonoveno", + "decimonovena", "vigésimo", + "vigésima", "trigésimo", + "trigésima", "cuadragésimo", + "cuadragésima", "quincuagésimo", + "quincuagésima", "sexagésimo", + "sexagésima", "septuagésimo", + "septuagésima", + "octogésimo", "octogésima", + "nonagésimo", "nonagésima", + "centésimo", "centésima", + "milésimo", "milésima", + "millonésimo", "millonésima", + "billonésimo", "billonésima", ] @@ -84,7 +117,7 @@ _ordinal_words = [ def like_num(text): if text.startswith(("+", "-", "±", "~")): text = text[1:] - text = text.replace(",", "").replace(".", "") + text = text.replace(",", "").replace(".", "").replace("º", "").replace("ª", "") if text.isdigit(): return True if text.count("/") == 1: @@ -97,6 +130,9 @@ def like_num(text): # Check ordinal number if text_lower in _ordinal_words: return True + # Check plural ordinal number + if text_lower[:-1] in _ordinal_words and text_lower.endswith("s"): + return True return False diff --git a/spacy/lang/pt/lex_attrs.py b/spacy/lang/pt/lex_attrs.py index de6a67f14..b3d98c9f1 100644 --- a/spacy/lang/pt/lex_attrs.py +++ b/spacy/lang/pt/lex_attrs.py @@ -3,7 +3,9 @@ from ...attrs import LIKE_NUM _num_words = [ "zero", "um", + "uma", "dois", + "duas", "três", "tres", "quatro", @@ -37,13 +39,21 @@ _num_words = [ "cem", "cento", "duzentos", + "duzentas", "trezentos", + "trezentas", "quatrocentos", + "quatrocentas", "quinhentos", - "seicentos", + "quinhentas", + "seiscentos", + "seiscentas", "setecentos", + "setecentas", "oitocentos", + "oitocentas", "novecentos", + "novecentas", "mil", "milhão", "milhao", @@ -63,38 +73,68 @@ _num_words = [ "quadrilhoes", ] - +# Masculine and feminine forms _ordinal_words = [ "primeiro", + "primeira", "segundo", + "segunda", "terceiro", + "terceira", "quarto", + "quarta", "quinto", + "quinta", "sexto", + "sexta", "sétimo", + "séptima", "oitavo", + "oitava", "nono", + "nona", "décimo", + "décima", "vigésimo", + "vigésima", "trigésimo", + "trigésima", "quadragésimo", + "quadragésima", "quinquagésimo", + "quinquagésima", "sexagésimo", + "sexagésima", "septuagésimo", + "septuagésima", "octogésimo", + "octogésima", "nonagésimo", + "nonagésima", "centésimo", + "centésima", "ducentésimo", + "ducentésima", "trecentésimo", + "trecentésima", "quadringentésimo", + "quadringentésima", "quingentésimo", + "quingentésima", "sexcentésimo", + "sexcentésima", "septingentésimo", + "septingentésima", "octingentésimo", + "octingentésima", "nongentésimo", + "nongentésima", "milésimo", + "milésima", "milionésimo", + "milionésima", "bilionésimo", + "bilionésima", ] @@ -108,9 +148,14 @@ def like_num(text): num, denom = text.split("/") if num.isdigit() and denom.isdigit(): return True - if text.lower() in _num_words: + text_lower = text.lower() + if text_lower in _num_words: return True - if text.lower() in _ordinal_words: + # Check ordinal number + if text_lower in _ordinal_words: + return True + # Check plural ordinal number + if text_lower[:-1] in _ordinal_words and text_lower.endswith("s"): return True return False diff --git a/spacy/tests/lang/es/test_text.py b/spacy/tests/lang/es/test_text.py index 1d1f7fa6b..1a7614957 100644 --- a/spacy/tests/lang/es/test_text.py +++ b/spacy/tests/lang/es/test_text.py @@ -1,15 +1,13 @@ import pytest -from spacy.lang.es import Spanish from spacy.lang.es.lex_attrs import like_num @pytest.mark.issue(3803) -def test_issue3803(): +@pytest.mark.parametrize("text", ["2 dos 1000 mil 12 doce"]) +def test_issue3803(es_tokenizer, text): """Test that spanish num-like tokens have True for like_num attribute.""" - nlp = Spanish() - text = "2 dos 1000 mil 12 doce" - doc = nlp(text) + doc = es_tokenizer(text) assert [t.like_num for t in doc] == [True, True, True, True, True, True] @@ -62,12 +60,19 @@ def test_es_tokenizer_handles_cnts(es_tokenizer, text, length): ("1/2", True), ], ) -def test_lex_attrs_like_number(es_tokenizer, text, match): +def test_es_lex_attrs_like_number(es_tokenizer, text, match): tokens = es_tokenizer(text) assert len(tokens) == 1 assert tokens[0].like_num == match +@pytest.mark.parametrize( + "word", ["tercero", "décimos", "Millonésimo", "100.º", "Centésima", "9ª", "primer"] +) +def test_es_lex_attrs_like_number_for_ordinal(word): + assert like_num(word) + + @pytest.mark.parametrize("word", ["once"]) def test_es_lex_attrs_capitals(word): assert like_num(word) diff --git a/spacy/tests/lang/pt/test_text.py b/spacy/tests/lang/pt/test_text.py index cb8723901..233b0b773 100644 --- a/spacy/tests/lang/pt/test_text.py +++ b/spacy/tests/lang/pt/test_text.py @@ -3,6 +3,37 @@ import pytest from spacy.lang.pt.lex_attrs import like_num +@pytest.mark.parametrize( + "text,match", + [ + ("10", True), + ("1", True), + ("10,000", True), + ("10,00", True), + ("999.0", True), + ("um", True), + ("dois", True), + ("bilhão", True), + ("vinte", True), + ("cachorro", False), + (",", False), + ("1/2", True), + ("duas", True), + ], +) +def test_pt_lex_attrs_like_number(pt_tokenizer, text, match): + tokens = pt_tokenizer(text) + assert len(tokens) == 1 + assert tokens[0].like_num == match + + +@pytest.mark.parametrize( + "word", ["terceiro", "décimos", "Milionésimo", "100.º", "Centésimo", "9.ª"] +) +def test_pt_lex_attrs_like_number_for_ordinal(word): + assert like_num(word) + + @pytest.mark.parametrize("word", ["onze", "quadragésimo"]) def test_pt_lex_attrs_capitals(word): assert like_num(word)