From 731b40cf51b97fe27cda4a07ec61dd6b88a5cf31 Mon Sep 17 00:00:00 2001
From: Matthew Alexander Hernandez <mhernan0697@gmail.com>
Date: Sat, 29 Nov 2025 02:03:01 -0700
Subject: [PATCH] Enhance lex_attrs for Spanish & Portuguese

---
 .github/contributors/weezymatt.md | 106 ++++++++++++++++++++++++++++++
 spacy/lang/es/lex_attrs.py        |  40 ++++++++++-
 spacy/lang/pt/lex_attrs.py        |  53 +++++++++++++--
 spacy/tests/lang/es/test_text.py  |  17 +++--
 spacy/tests/lang/pt/test_text.py  |  31 +++++++++
 5 files changed, 235 insertions(+), 12 deletions(-)
 create mode 100644 .github/contributors/weezymatt.md

diff --git a/.github/contributors/weezymatt.md b/.github/contributors/weezymatt.md
new file mode 100644
index 000000000..ca7091507
--- /dev/null
+++ b/.github/contributors/weezymatt.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Matthew A. Hernandez |
+| Company name (if applicable)   |                      |
+| Title or role (if applicable)  |                      |
+| Date                           | 11/29/2025           |
+| GitHub username                | weezygeezer          |
+| Website (optional)             |                      |
diff --git a/spacy/lang/es/lex_attrs.py b/spacy/lang/es/lex_attrs.py
index 4c477eaee..09f618c88 100644
--- a/spacy/lang/es/lex_attrs.py
+++ b/spacy/lang/es/lex_attrs.py
@@ -45,38 +45,71 @@ _num_words = [
     "trillón",
 ]
 
-
+# Masculine, feminine, and apocopated forms
 _ordinal_words = [
+    "primer",
     "primero",
+    "primera",
     "segundo",
+    "segunda",
+    "tercer",
     "tercero",
+    "tercera",
     "cuarto",
+    "cuarta",
     "quinto",
+    "quinta",
     "sexto",
+    "sexta",
     "séptimo",
+    "séptima",
     "octavo",
+    "octava",
     "noveno",
+    "novena",
     "décimo",
+    "décima",
     "undécimo",
+    "undécima",
     "duodécimo",
+    "duodécima",
     "decimotercero",
+    "decimotercera",
     "decimocuarto",
+    "decimocuarta",
     "decimoquinto",
+    "decimoquinta",
     "decimosexto",
+    "decimosexta",
     "decimoséptimo",
+    "decimoséptima",
     "decimoctavo",
+    "decimoctava",
     "decimonoveno",
+    "decimonovena",
     "vigésimo",
+    "vigésima",
     "trigésimo",
+    "trigésima",
     "cuadragésimo",
+    "cuadragésima",
     "quincuagésimo",
+    "quincuagésima",
     "sexagésimo",
+    "sexagésima",
     "septuagésimo",
+    "septuagésima",
+    "octogésimo",
     "octogésima",
+    "nonagésimo",
     "nonagésima",
+    "centésimo",
     "centésima",
+    "milésimo",
     "milésima",
+    "millonésimo",
     "millonésima",
+    "billonésimo",
     "billonésima",
 ]
 
@@ -84,7 +117,7 @@ _ordinal_words = [
 def like_num(text):
     if text.startswith(("+", "-", "±", "~")):
         text = text[1:]
-    text = text.replace(",", "").replace(".", "")
+    text = text.replace(",", "").replace(".", "").replace("º", "").replace("ª", "")
     if text.isdigit():
         return True
     if text.count("/") == 1:
@@ -97,6 +130,9 @@ def like_num(text):
     # Check ordinal number
     if text_lower in _ordinal_words:
         return True
+    # Check plural ordinal number
+    if text_lower[:-1] in _ordinal_words and text_lower.endswith("s"):
+        return True
     return False
 
 
diff --git a/spacy/lang/pt/lex_attrs.py b/spacy/lang/pt/lex_attrs.py
index de6a67f14..b3d98c9f1 100644
--- a/spacy/lang/pt/lex_attrs.py
+++ b/spacy/lang/pt/lex_attrs.py
@@ -3,7 +3,9 @@ from ...attrs import LIKE_NUM
 _num_words = [
     "zero",
     "um",
+    "uma",
     "dois",
+    "duas",
     "três",
     "tres",
     "quatro",
@@ -37,13 +39,21 @@ _num_words = [
     "cem",
     "cento",
     "duzentos",
+    "duzentas",
     "trezentos",
+    "trezentas",
     "quatrocentos",
+    "quatrocentas",
     "quinhentos",
-    "seicentos",
+    "quinhentas",
+    "seiscentos",
+    "seiscentas",
     "setecentos",
+    "setecentas",
     "oitocentos",
+    "oitocentas",
     "novecentos",
+    "novecentas",
     "mil",
     "milhão",
     "milhao",
@@ -63,38 +73,68 @@ _num_words = [
     "quadrilhoes",
 ]
 
-
+# Masculine and feminine forms
 _ordinal_words = [
     "primeiro",
+    "primeira",
     "segundo",
+    "segunda",
     "terceiro",
+    "terceira",
     "quarto",
+    "quarta",
     "quinto",
+    "quinta",
     "sexto",
+    "sexta",
     "sétimo",
+    "sétima",
     "oitavo",
+    "oitava",
     "nono",
+    "nona",
     "décimo",
+    "décima",
     "vigésimo",
+    "vigésima",
     "trigésimo",
+    "trigésima",
     "quadragésimo",
+    "quadragésima",
     "quinquagésimo",
+    "quinquagésima",
     "sexagésimo",
+    "sexagésima",
     "septuagésimo",
+    "septuagésima",
     "octogésimo",
+    "octogésima",
     "nonagésimo",
+    "nonagésima",
     "centésimo",
+    "centésima",
     "ducentésimo",
+    "ducentésima",
     "trecentésimo",
+    "trecentésima",
     "quadringentésimo",
+    "quadringentésima",
     "quingentésimo",
+    "quingentésima",
     "sexcentésimo",
+    "sexcentésima",
     "septingentésimo",
+    "septingentésima",
     "octingentésimo",
+    "octingentésima",
     "nongentésimo",
+    "nongentésima",
     "milésimo",
+    "milésima",
     "milionésimo",
+    "milionésima",
     "bilionésimo",
+    "bilionésima",
 ]
 
 
@@ -108,9 +148,14 @@ def like_num(text):
         num, denom = text.split("/")
         if num.isdigit() and denom.isdigit():
             return True
-    if text.lower() in _num_words:
+    text_lower = text.lower()
+    if text_lower in _num_words:
         return True
-    if text.lower() in _ordinal_words:
+    # Check ordinal number
+    if text_lower in _ordinal_words:
+        return True
+    # Check plural ordinal number
+    if text_lower[:-1] in _ordinal_words and text_lower.endswith("s"):
         return True
     return False
 
diff --git a/spacy/tests/lang/es/test_text.py b/spacy/tests/lang/es/test_text.py
index 1d1f7fa6b..1a7614957 100644
--- a/spacy/tests/lang/es/test_text.py
+++ b/spacy/tests/lang/es/test_text.py
@@ -1,15 +1,13 @@
 import pytest
 
-from spacy.lang.es import Spanish
 from spacy.lang.es.lex_attrs import like_num
 
 
 @pytest.mark.issue(3803)
-def test_issue3803():
+@pytest.mark.parametrize("text", ["2 dos 1000 mil 12 doce"])
+def test_issue3803(es_tokenizer, text):
     """Test that spanish num-like tokens have True for like_num attribute."""
-    nlp = Spanish()
-    text = "2 dos 1000 mil 12 doce"
-    doc = nlp(text)
+    doc = es_tokenizer(text)
 
     assert [t.like_num for t in doc] == [True, True, True, True, True, True]
 
@@ -62,12 +60,19 @@ def test_es_tokenizer_handles_cnts(es_tokenizer, text, length):
         ("1/2", True),
     ],
 )
-def test_lex_attrs_like_number(es_tokenizer, text, match):
+def test_es_lex_attrs_like_number(es_tokenizer, text, match):
     tokens = es_tokenizer(text)
     assert len(tokens) == 1
     assert tokens[0].like_num == match
 
 
+@pytest.mark.parametrize(
+    "word", ["tercero", "décimos", "Millonésimo", "100.º", "Centésima", "9ª", "primer"]
+)
+def test_es_lex_attrs_like_number_for_ordinal(word):
+    assert like_num(word)
+
+
 @pytest.mark.parametrize("word", ["once"])
 def test_es_lex_attrs_capitals(word):
     assert like_num(word)
diff --git a/spacy/tests/lang/pt/test_text.py b/spacy/tests/lang/pt/test_text.py
index cb8723901..233b0b773 100644
--- a/spacy/tests/lang/pt/test_text.py
+++ b/spacy/tests/lang/pt/test_text.py
@@ -3,6 +3,37 @@ import pytest
 from spacy.lang.pt.lex_attrs import like_num
 
 
+@pytest.mark.parametrize(
+    "text,match",
+    [
+        ("10", True),
+        ("1", True),
+        ("10,000", True),
+        ("10,00", True),
+        ("999.0", True),
+        ("um", True),
+        ("dois", True),
+        ("bilhão", True),
+        ("vinte", True),
+        ("cachorro", False),
+        (",", False),
+        ("1/2", True),
+        ("duas", True),
+    ],
+)
+def test_pt_lex_attrs_like_number(pt_tokenizer, text, match):
+    tokens = pt_tokenizer(text)
+    assert len(tokens) == 1
+    assert tokens[0].like_num == match
+
+
+@pytest.mark.parametrize(
+    "word", ["terceiro", "décimos", "Milionésimo", "100.º", "Centésimo", "9.ª"]
+)
+def test_pt_lex_attrs_like_number_for_ordinal(word):
+    assert like_num(word)
+
+
 @pytest.mark.parametrize("word", ["onze", "quadragésimo"])
 def test_pt_lex_attrs_capitals(word):
     assert like_num(word)