Merge 731b40cf51 into c1e7cb2ebf

2026-01-09 18:21:14 +03:00 · 2025-12-01 15:43:58 -07:00 · 2025-12-01 15:43:58 -07:00 · d3760156f4
commit d3760156f4
parent c1e7cb2ebf 731b40cf51
5 changed files with 235 additions and 12 deletions
--- a/.github/contributors/weezymatt.md
+++ b/.github/contributors/weezymatt.md
@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Matthew A. Hernandez |
+| Company name (if applicable)   |                      |
+| Title or role (if applicable)  |                      |
+| Date                           | 11/29/2025           |
+| GitHub username                | weezymatt            |
+| Website (optional)             |                      |
--- a/spacy/lang/es/lex_attrs.py
+++ b/spacy/lang/es/lex_attrs.py
@ -45,38 +45,71 @@ _num_words = [
    "trillón",
 ]

-
+# Masculine, feminine, and apocopation forms
 _ordinal_words = [
+    "primer",
    "primero",
+    "primera",
    "segundo",
+    "segunda",
+    "tercer",
    "tercero",
+    "tercera",
    "cuarto",
+    "cuarta",
    "quinto",
+    "quinta",
    "sexto",
+    "sexta",
    "séptimo",
+    "séptima",
    "octavo",
+    "octava",
    "noveno",
+    "novena",
    "décimo",
+    "décima",
    "undécimo",
+    "undécima",
    "duodécimo",
+    "duodécima",
    "decimotercero",
+    "decimotercera",
    "decimocuarto",
+    "decimocuarta",
    "decimoquinto",
+    "decimoquinta",
    "decimosexto",
+    "decimosexta",
    "decimoséptimo",
+    "decimoséptima",
    "decimoctavo",
+    "decimoctava",
    "decimonoveno",
+    "decimonovena",
    "vigésimo",
+    "vigésima",
    "trigésimo",
+    "trigésima",
    "cuadragésimo",
+    "cuadragésima",
    "quincuagésimo",
+    "quincuagésima",
    "sexagésimo",
+    "sexagésima",
    "septuagésimo",
+    "septuagésima",
+    "octogésimo",
    "octogésima",
+    "nonagésimo",
    "nonagésima",
+    "centésimo",
    "centésima",
+    "milésimo",
    "milésima",
+    "millonésimo",
    "millonésima",
+    "billonésimo",
    "billonésima",
 ]

@ -84,7 +117,7 @@ _ordinal_words = [
 def like_num(text):
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
-    text = text.replace(",", "").replace(".", "")
+    text = text.replace(",", "").replace(".", "").replace("º", "").replace("ª", "")
    if text.isdigit():
        return True
    if text.count("/") == 1:
@ -97,6 +130,9 @@ def like_num(text):
    # Check ordinal number
    if text_lower in _ordinal_words:
        return True
+    # Check plural ordinal number
+    if text_lower[:-1] in _ordinal_words and text_lower.endswith("s"):
+        return True
    return False


--- a/spacy/lang/pt/lex_attrs.py
+++ b/spacy/lang/pt/lex_attrs.py
@ -3,7 +3,9 @@ from ...attrs import LIKE_NUM
 _num_words = [
    "zero",
    "um",
+    "uma",
    "dois",
+    "duas",
    "três",
    "tres",
    "quatro",
@ -37,13 +39,21 @@ _num_words = [
    "cem",
    "cento",
    "duzentos",
+    "duzentas",
    "trezentos",
+    "trezentas",
    "quatrocentos",
+    "quatrocentas",
    "quinhentos",
-    "seicentos",
+    "quinhentas",
+    "seiscentos",
+    "seiscentas",
    "setecentos",
+    "setecentas",
    "oitocentos",
+    "oitocentas",
    "novecentos",
+    "novecentas",
    "mil",
    "milhão",
    "milhao",
@ -63,38 +73,68 @@ _num_words = [
    "quadrilhoes",
 ]

-
+# Masculine and feminine forms
 _ordinal_words = [
    "primeiro",
+    "primeira",
    "segundo",
+    "segunda",
    "terceiro",
+    "terceira",
    "quarto",
+    "quarta",
    "quinto",
+    "quinta",
    "sexto",
+    "sexta",
    "sétimo",
+    "sétima",
    "oitavo",
+    "oitava",
    "nono",
+    "nona",
    "décimo",
+    "décima",
    "vigésimo",
+    "vigésima",
    "trigésimo",
+    "trigésima",
    "quadragésimo",
+    "quadragésima",
    "quinquagésimo",
+    "quinquagésima",
    "sexagésimo",
+    "sexagésima",
    "septuagésimo",
+    "septuagésima",
    "octogésimo",
+    "octogésima",
    "nonagésimo",
+    "nonagésima",
    "centésimo",
+    "centésima",
    "ducentésimo",
+    "ducentésima",
    "trecentésimo",
+    "trecentésima",
    "quadringentésimo",
+    "quadringentésima",
    "quingentésimo",
+    "quingentésima",
    "sexcentésimo",
+    "sexcentésima",
    "septingentésimo",
+    "septingentésima",
    "octingentésimo",
+    "octingentésima",
    "nongentésimo",
+    "nongentésima",
    "milésimo",
+    "milésima",
    "milionésimo",
+    "milionésima",
    "bilionésimo",
+    "bilionésima",
 ]


@ -108,9 +148,14 @@ def like_num(text):
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
-    if text.lower() in _num_words:
+    text_lower = text.lower()
+    if text_lower in _num_words:
        return True
-    if text.lower() in _ordinal_words:
+    # Check ordinal number
+    if text_lower in _ordinal_words:
+        return True
+    # Check plural ordinal number
+    if text_lower[:-1] in _ordinal_words and text_lower.endswith("s"):
        return True
    return False

--- a/spacy/tests/lang/es/test_text.py
+++ b/spacy/tests/lang/es/test_text.py
@ -1,15 +1,13 @@
 import pytest

-from spacy.lang.es import Spanish
 from spacy.lang.es.lex_attrs import like_num


@pytest.mark.issue(3803)
-def test_issue3803():
+@pytest.mark.parametrize("text", ["2 dos 1000 mil 12 doce"])
+def test_issue3803(es_tokenizer, text):
    """Test that spanish num-like tokens have True for like_num attribute."""
-    nlp = Spanish()
-    text = "2 dos 1000 mil 12 doce"
-    doc = nlp(text)
+    doc = es_tokenizer(text)

    assert [t.like_num for t in doc] == [True, True, True, True, True, True]

@ -62,12 +60,19 @@ def test_es_tokenizer_handles_cnts(es_tokenizer, text, length):
        ("1/2", True),
    ],
 )
-def test_lex_attrs_like_number(es_tokenizer, text, match):
+def test_es_lex_attrs_like_number(es_tokenizer, text, match):
    tokens = es_tokenizer(text)
    assert len(tokens) == 1
    assert tokens[0].like_num == match


+@pytest.mark.parametrize(
+    "word", ["tercero", "décimos", "Millonésimo", "100.º", "Centésima", "9ª", "primer"]
+)
+def test_es_lex_attrs_like_number_for_ordinal(word):
+    assert like_num(word)
+
+
@pytest.mark.parametrize("word", ["once"])
 def test_es_lex_attrs_capitals(word):
    assert like_num(word)
--- a/spacy/tests/lang/pt/test_text.py
+++ b/spacy/tests/lang/pt/test_text.py
@ -3,6 +3,37 @@ import pytest
 from spacy.lang.pt.lex_attrs import like_num


+@pytest.mark.parametrize(
+    "text,match",
+    [
+        ("10", True),
+        ("1", True),
+        ("10,000", True),
+        ("10,00", True),
+        ("999.0", True),
+        ("um", True),
+        ("dois", True),
+        ("bilhão", True),
+        ("vinte", True),
+        ("cachorro", False),
+        (",", False),
+        ("1/2", True),
+        ("duas", True),
+    ],
+)
+def test_pt_lex_attrs_like_number(pt_tokenizer, text, match):
+    tokens = pt_tokenizer(text)
+    assert len(tokens) == 1
+    assert tokens[0].like_num == match
+
+
+@pytest.mark.parametrize(
+    "word", ["terceiro", "décimos", "Milionésimo", "100.º", "Centésimo", "9.ª"]
+)
+def test_pt_lex_attrs_like_number_for_ordinal(word):
+    assert like_num(word)
+
+
@pytest.mark.parametrize("word", ["onze", "quadragésimo"])
 def test_pt_lex_attrs_capitals(word):
    assert like_num(word)