Merge branch 'master' into fix/enum-python-types

This commit is contained in:
Matthew Honnibal 2025-05-28 17:26:47 +02:00
commit 79f9d3ea2a
28 changed files with 928 additions and 71 deletions

View File

@ -17,7 +17,6 @@ requests>=2.13.0,<3.0.0
tqdm>=4.38.0,<5.0.0
pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0
jinja2
langcodes>=3.2.0,<4.0.0
# Official Python utilities
setuptools
packaging>=20.0

View File

@ -65,7 +65,6 @@ install_requires =
# Official Python utilities
setuptools
packaging>=20.0
langcodes>=3.2.0,<4.0.0
[options.entry_points]
console_scripts =

52
spacy/lang/ht/__init__.py Normal file
View File

@ -0,0 +1,52 @@
from typing import Callable, Optional
from thinc.api import Model
from ...language import BaseDefaults, Language
from .lemmatizer import HaitianCreoleLemmatizer
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .tag_map import TAG_MAP
class HaitianCreoleDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
lex_attr_getters = LEX_ATTRS
syntax_iterators = SYNTAX_ITERATORS
stop_words = STOP_WORDS
tag_map = TAG_MAP
class HaitianCreole(Language):
lang = "ht"
Defaults = HaitianCreoleDefaults
@HaitianCreole.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={
"model": None,
"mode": "rule",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
):
return HaitianCreoleLemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["HaitianCreole"]

18
spacy/lang/ht/examples.py Normal file
View File

@ -0,0 +1,18 @@
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.ht.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Apple ap panse achte yon demaraj nan Wayòm Ini pou $1 milya dola",
"Machin otonòm fè responsablite asirans lan ale sou men fabrikan yo",
"San Francisco ap konsidere entèdi robo ki livre sou twotwa yo",
"Lond se yon gwo vil nan Wayòm Ini",
"Kote ou ye?",
"Kilès ki prezidan Lafrans?",
"Ki kapital Etazini?",
"Kile Barack Obama te fèt?",
]

View File

@ -0,0 +1,51 @@
from typing import List, Tuple
from ...pipeline import Lemmatizer
from ...tokens import Token
from ...lookups import Lookups
class HaitianCreoleLemmatizer(Lemmatizer):
"""
Minimal Haitian Creole lemmatizer.
Returns a word's base form based on rules and lookup,
or defaults to the original form.
"""
def is_base_form(self, token: Token) -> bool:
morph = token.morph.to_dict()
upos = token.pos_.lower()
# Consider unmarked forms to be base
if upos in {"noun", "verb", "adj", "adv"}:
if not morph:
return True
if upos == "noun" and morph.get("Number") == "Sing":
return True
if upos == "verb" and morph.get("VerbForm") == "Inf":
return True
if upos == "adj" and morph.get("Degree") == "Pos":
return True
return False
def rule_lemmatize(self, token: Token) -> List[str]:
string = token.text.lower()
pos = token.pos_.lower()
cache_key = (token.orth, token.pos)
if cache_key in self.cache:
return self.cache[cache_key]
forms = []
# fallback rule: just return lowercased form
forms.append(string)
self.cache[cache_key] = forms
return forms
@classmethod
def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
if mode == "rule":
required = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
return (required, [])
return super().get_lookups_config(mode)

View File

@ -0,0 +1,78 @@
from ...attrs import LIKE_NUM, NORM
# Cardinal numbers in Creole
_num_words = set(
"""
zewo youn en de twa kat senk sis sèt uit nèf dis
onz douz trèz katoz kenz sèz disèt dizwit diznèf
vent trant karant sinkant swasant swasann-dis
san mil milyon milya
""".split()
)
# Ordinal numbers in Creole (some are French-influenced, some simplified)
_ordinal_words = set(
"""
premye dezyèm twazyèm katryèm senkyèm sizyèm sètvyèm uitvyèm nèvyèm dizyèm
onzèm douzyèm trèzyèm katozyèm kenzèm sèzyèm disetyèm dizwityèm diznèvyèm
ventyèm trantyèm karantyèm sinkantyèm swasantyèm
swasann-disyèm santyèm milyèm milyonnyèm milyadyèm
""".split()
)
NORM_MAP = {
"'m": "mwen",
"'w": "ou",
"'l": "li",
"'n": "nou",
"'y": "yo",
"m": "mwen",
"w": "ou",
"l": "li",
"n": "nou",
"y": "yo",
"m": "mwen",
"n": "nou",
"l": "li",
"y": "yo",
"w": "ou",
"t": "te",
"k": "ki",
"p": "pa",
"M": "Mwen",
"N": "Nou",
"L": "Li",
"Y": "Yo",
"W": "Ou",
"T": "Te",
"K": "Ki",
"P": "Pa",
}
def like_num(text):
text = text.strip().lower()
if text.startswith(("+", "-", "±", "~")):
text = text[1:]
text = text.replace(",", "").replace(".", "")
if text.isdigit():
return True
if text.count("/") == 1:
num, denom = text.split("/")
if num.isdigit() and denom.isdigit():
return True
if text in _num_words:
return True
if text in _ordinal_words:
return True
# Handle things like "3yèm", "10yèm", "25yèm", etc.
if text.endswith("yèm") and text[:-3].isdigit():
return True
return False
def norm_custom(text):
return NORM_MAP.get(text, text.lower())
LEX_ATTRS = {
LIKE_NUM: like_num,
NORM: norm_custom,
}
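For reference, a short sketch of the lexical attribute getters in isolation (the results follow directly from the word lists and NORM_MAP above; the same import path is used by the tests added later in this commit):
from spacy.lang.ht.lex_attrs import like_num, norm_custom
print(like_num("swasann-dis"))  # True: listed cardinal number word
print(like_num("3yèm"))         # True: digits plus the "yèm" ordinal suffix
print(like_num("chen"))         # False: not a number word
print(norm_custom("'m"))        # "mwen": mapped via NORM_MAP
print(norm_custom("Konnen"))    # "konnen": falls back to lowercasing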

View File

@ -0,0 +1,43 @@
from ..char_classes import (
ALPHA,
ALPHA_LOWER,
ALPHA_UPPER,
CONCAT_QUOTES,
HYPHENS,
LIST_PUNCT,
LIST_QUOTES,
LIST_ELLIPSES,
LIST_ICONS,
merge_chars,
)
ELISION = "'".replace(" ", "")
_prefixes_elision = "m n l y t k w"
_prefixes_elision += " " + _prefixes_elision.upper()
TOKENIZER_PREFIXES = LIST_PUNCT + LIST_QUOTES + [
r"(?:({pe})[{el}])(?=[{a}])".format(
a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
)
]
TOKENIZER_SUFFIXES = LIST_PUNCT + LIST_QUOTES + LIST_ELLIPSES + [
r"(?<=[0-9])%", # numbers like 10%
r"(?<=[0-9])(?:{h})".format(h=HYPHENS), # hyphens after numbers
r"(?<=[{a}])[']".format(a=ALPHA), # apostrophes after letters
r"(?<=[{a}])['][mwlnytk](?=\s|$)".format(a=ALPHA), # contractions
r"(?<=[{a}0-9])\)", # right parenthesis after letter/number
r"(?<=[{a}])\.(?=\s|$)".format(a=ALPHA), # period after letter if space or end of string
r"(?<=\))[\.\?!]", # punctuation immediately after right parenthesis
]
TOKENIZER_INFIXES = LIST_ELLIPSES + LIST_ICONS + [
r"(?<=[0-9])[+\-\*^](?=[0-9-])",
r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
]

View File

@ -0,0 +1,50 @@
STOP_WORDS = set(
"""
a ak an ankò ant apre ap atò avan avanlè
byen byenke
chak
de depi deja
e en epi èske
fòk
gen genyen
ki kisa kilès kote koukou konsa konbyen konn konnen kounye kouman
la l laa le li lye
m m' mwen
nan nap nou n'
ou oumenm
pa paske pami pandan pito pou pral preske pwiske
se selman si sou sòt
ta tap tankou te toujou tou tan tout toutotan twòp tèl
w w' wi wè
y y' yo yon yonn
non o oh eh
sa san si swa
men mèsi oswa osinon
"""
.split()
)
# Add common contractions, with and without apostrophe variants
contractions = ["m'", "n'", "w'", "y'", "l'", "t'", "k'"]
for apostrophe in ["'", "’", ""]:
for word in contractions:
STOP_WORDS.add(word.replace("'", apostrophe))

View File

@ -0,0 +1,74 @@
from typing import Iterator, Tuple, Union
from ...errors import Errors
from ...symbols import NOUN, PRON, PROPN
from ...tokens import Doc, Span
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
"""
Detect base noun phrases from a dependency parse for Haitian Creole.
Works on both Doc and Span objects.
"""
# Core nominal dependencies common in Haitian Creole
labels = [
"nsubj",
"obj",
"obl",
"nmod",
"appos",
"ROOT",
]
# Modifiers to optionally include in chunk (to the right)
post_modifiers = ["compound", "flat", "flat:name", "fixed"]
doc = doclike.doc
if not doc.has_annotation("DEP"):
raise ValueError(Errors.E029)
np_deps = {doc.vocab.strings.add(label) for label in labels}
np_mods = {doc.vocab.strings.add(mod) for mod in post_modifiers}
conj_label = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP")
adp_pos = doc.vocab.strings.add("ADP")
cc_pos = doc.vocab.strings.add("CCONJ")
prev_end = -1
for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN, PRON):
continue
if word.left_edge.i <= prev_end:
continue
if word.dep in np_deps:
right_end = word
# expand to include known modifiers to the right
for child in word.rights:
if child.dep in np_mods:
right_end = child.right_edge
elif child.pos == NOUN:
right_end = child.right_edge
left_index = word.left_edge.i
# Skip prepositions at the start
if word.left_edge.pos == adp_pos:
left_index += 1
prev_end = right_end.i
yield left_index, right_end.i + 1, np_label
elif word.dep == conj_label:
head = word.head
while head.dep == conj_label and head.head.i < head.i:
head = head.head
if head.dep in np_deps:
left_index = word.left_edge.i
if word.left_edge.pos == cc_pos:
left_index += 1
prev_end = word.i
yield left_index, word.i + 1, np_label
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
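A minimal sketch of the iterator on a hand-annotated Doc (it mirrors the fixture used in the tests added below; the chunk spans in the comment are what the rules above yield for that parse):
import spacy
from spacy.tokens import Doc
nlp = spacy.blank("ht")
words = ["Pitit", "gen", "gwo", "pwoblèm", "ak", "kontwòl"]
heads = [1, 1, 5, 5, 3, 3]
deps = ["nsubj", "ROOT", "amod", "obj", "case", "nmod"]
pos = ["NOUN", "VERB", "ADJ", "NOUN", "ADP", "NOUN"]
doc = Doc(nlp.vocab, words=words, heads=heads, deps=deps, pos=pos)
print([chunk.text for chunk in doc.noun_chunks])
# expected: ["Pitit", "gwo pwoblèm ak kontwòl"]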

21
spacy/lang/ht/tag_map.py Normal file
View File

@ -0,0 +1,21 @@
from spacy.symbols import (
    NOUN, VERB, AUX, ADJ, ADV, PRON, DET, ADP, SCONJ, CCONJ,
    PART, INTJ, NUM, PROPN, PUNCT, SYM, X,
)
TAG_MAP = {
"NOUN": {"pos": NOUN},
"VERB": {"pos": VERB},
"AUX": {"pos": AUX},
"ADJ": {"pos": ADJ},
"ADV": {"pos": ADV},
"PRON": {"pos": PRON},
"DET": {"pos": DET},
"ADP": {"pos": ADP},
"SCONJ": {"pos": SCONJ},
"CCONJ": {"pos": CCONJ},
"PART": {"pos": PART},
"INTJ": {"pos": INTJ},
"NUM": {"pos": NUM},
"PROPN": {"pos": PROPN},
"PUNCT": {"pos": PUNCT},
"SYM": {"pos": SYM},
"X": {"pos": X},
}

View File

@ -0,0 +1,121 @@
from spacy.symbols import ORTH, NORM
def make_variants(base, first_norm, second_orth, second_norm):
return {
base: [
{ORTH: base.split("'")[0] + "'", NORM: first_norm},
{ORTH: second_orth, NORM: second_norm},
],
base.capitalize(): [
{ORTH: base.split("'")[0].capitalize() + "'", NORM: first_norm.capitalize()},
{ORTH: second_orth, NORM: second_norm},
]
}
TOKENIZER_EXCEPTIONS = {
"Dr.": [{ORTH: "Dr."}]
}
# Apostrophe forms
TOKENIZER_EXCEPTIONS.update(make_variants("m'ap", "mwen", "ap", "ap"))
TOKENIZER_EXCEPTIONS.update(make_variants("n'ap", "nou", "ap", "ap"))
TOKENIZER_EXCEPTIONS.update(make_variants("l'ap", "li", "ap", "ap"))
TOKENIZER_EXCEPTIONS.update(make_variants("y'ap", "yo", "ap", "ap"))
TOKENIZER_EXCEPTIONS.update(make_variants("m'te", "mwen", "te", "te"))
TOKENIZER_EXCEPTIONS.update(make_variants("m'pral", "mwen", "pral", "pral"))
TOKENIZER_EXCEPTIONS.update(make_variants("w'ap", "ou", "ap", "ap"))
TOKENIZER_EXCEPTIONS.update(make_variants("k'ap", "ki", "ap", "ap"))
TOKENIZER_EXCEPTIONS.update(make_variants("p'ap", "pa", "ap", "ap"))
TOKENIZER_EXCEPTIONS.update(make_variants("t'ap", "te", "ap", "ap"))
# Non-apostrophe contractions (with capitalized variants)
TOKENIZER_EXCEPTIONS.update({
"map": [
{ORTH: "m", NORM: "mwen"},
{ORTH: "ap", NORM: "ap"},
],
"Map": [
{ORTH: "M", NORM: "Mwen"},
{ORTH: "ap", NORM: "ap"},
],
"lem": [
{ORTH: "le", NORM: "le"},
{ORTH: "m", NORM: "mwen"},
],
"Lem": [
{ORTH: "Le", NORM: "Le"},
{ORTH: "m", NORM: "mwen"},
],
"lew": [
{ORTH: "le", NORM: "le"},
{ORTH: "w", NORM: "ou"},
],
"Lew": [
{ORTH: "Le", NORM: "Le"},
{ORTH: "w", NORM: "ou"},
],
"nap": [
{ORTH: "n", NORM: "nou"},
{ORTH: "ap", NORM: "ap"},
],
"Nap": [
{ORTH: "N", NORM: "Nou"},
{ORTH: "ap", NORM: "ap"},
],
"lap": [
{ORTH: "l", NORM: "li"},
{ORTH: "ap", NORM: "ap"},
],
"Lap": [
{ORTH: "L", NORM: "Li"},
{ORTH: "ap", NORM: "ap"},
],
"yap": [
{ORTH: "y", NORM: "yo"},
{ORTH: "ap", NORM: "ap"},
],
"Yap": [
{ORTH: "Y", NORM: "Yo"},
{ORTH: "ap", NORM: "ap"},
],
"mte": [
{ORTH: "m", NORM: "mwen"},
{ORTH: "te", NORM: "te"},
],
"Mte": [
{ORTH: "M", NORM: "Mwen"},
{ORTH: "te", NORM: "te"},
],
"mpral": [
{ORTH: "m", NORM: "mwen"},
{ORTH: "pral", NORM: "pral"},
],
"Mpral": [
{ORTH: "M", NORM: "Mwen"},
{ORTH: "pral", NORM: "pral"},
],
"wap": [
{ORTH: "w", NORM: "ou"},
{ORTH: "ap", NORM: "ap"},
],
"Wap": [
{ORTH: "W", NORM: "Ou"},
{ORTH: "ap", NORM: "ap"},
],
"kap": [
{ORTH: "k", NORM: "ki"},
{ORTH: "ap", NORM: "ap"},
],
"Kap": [
{ORTH: "K", NORM: "Ki"},
{ORTH: "ap", NORM: "ap"},
],
"tap": [
{ORTH: "t", NORM: "te"},
{ORTH: "ap", NORM: "ap"},
],
"Tap": [
{ORTH: "T", NORM: "Te"},
{ORTH: "ap", NORM: "ap"},
],
})
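For illustration, make_variants expands a single apostrophe contraction into its two-token analysis plus a capitalized variant. A small sketch (assuming this file lives at spacy/lang/ht/tokenizer_exceptions.py, as the import in __init__.py suggests; the values are just what the helper above returns for "m'ap"):
from spacy.lang.ht.tokenizer_exceptions import make_variants
from spacy.symbols import ORTH, NORM
variants = make_variants("m'ap", "mwen", "ap", "ap")
assert variants["m'ap"] == [{ORTH: "m'", NORM: "mwen"}, {ORTH: "ap", NORM: "ap"}]
assert variants["M'ap"] == [{ORTH: "M'", NORM: "Mwen"}, {ORTH: "ap", NORM: "ap"}]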

View File

@ -141,7 +141,7 @@ class Language:
Defaults (class): Settings, data and factory methods for creating the `nlp`
object and processing pipeline.
lang (str): IETF language code, such as 'en'.
lang (str): Two-letter ISO 639-1 or three-letter ISO 639-3 language code, such as 'en' or 'eng'.
DOCS: https://spacy.io/api/language
"""

View File

@ -212,6 +212,16 @@ def hr_tokenizer():
return get_lang_class("hr")().tokenizer
@pytest.fixture(scope="session")
def ht_tokenizer():
return get_lang_class("ht")().tokenizer
@pytest.fixture(scope="session")
def ht_vocab():
return get_lang_class("ht")().vocab
@pytest.fixture
def hu_tokenizer():
return get_lang_class("hu")().tokenizer

View File

@ -49,7 +49,7 @@ def doc_not_parsed(en_tokenizer):
def test_issue1537():
"""Test that Span.as_doc() doesn't segfault."""
string = "The sky is blue . The man is pink . The dog is purple ."
doc = Doc(Vocab(), words=string.split())
doc = Doc(Vocab(), words=list(string.split()))
doc[0].sent_start = True
for word in doc[1:]:
if word.nbor(-1).text == ".":
@ -225,6 +225,21 @@ def test_spans_span_sent(doc, doc_not_parsed):
assert doc_not_parsed[10:14].sent == doc_not_parsed[5:]
def test_issue13769():
# Test issue 13769: Incorrect output of span.sents when the final token belongs to a sentence outside of the span.
doc = Doc(
Vocab(),
words=list("This is a sentence . This is another sentence . Third".split()),
)
doc[0].is_sent_start = True
doc[5].is_sent_start = True
doc[10].is_sent_start = True
doc.ents = [("ENTITY", 7, 9)] # "another sentence" phrase in the second sentence
entity = doc.ents[0]
ent_sents = list(entity.sents)
assert len(ent_sents) == 1
@pytest.mark.parametrize(
"start,end,expected_sentence",
[

View File

View File

@ -0,0 +1,32 @@
import pytest
def test_ht_tokenizer_handles_basic_contraction(ht_tokenizer):
text = "m'ap ri"
tokens = ht_tokenizer(text)
assert len(tokens) == 3
assert tokens[0].text == "m'"
assert tokens[1].text == "ap"
assert tokens[2].text == "ri"
text = "mwen di'w non!"
tokens = ht_tokenizer(text)
assert len(tokens) == 5
assert tokens[0].text == "mwen"
assert tokens[1].text == "di"
assert tokens[2].text == "'w"
assert tokens[3].text == "non"
assert tokens[4].text == "!"
@pytest.mark.parametrize("text", ["Dr."])
def test_ht_tokenizer_handles_basic_abbreviation(ht_tokenizer, text):
tokens = ht_tokenizer(text)
assert len(tokens) == 1
assert tokens[0].text == text
def test_ht_tokenizer_full_sentence(ht_tokenizer):
text = "Si'm ka vini, m'ap pale ak li."
tokens = [t.text for t in ht_tokenizer(text)]
assert tokens == ["Si", "'m", "ka", "vini", ",", "m'", "ap", "pale", "ak", "li", "."]

View File

@ -0,0 +1,44 @@
import pytest
from spacy.tokens import Doc
@pytest.fixture
def doc(ht_vocab):
words = ["Pitit", "gen", "gwo", "pwoblèm", "ak", "kontwòl"]
heads = [1, 1, 5, 5, 3, 3]
deps = ["nsubj", "ROOT", "amod", "obj", "case", "nmod"]
pos = ["NOUN", "VERB", "ADJ", "NOUN", "ADP", "NOUN"]
return Doc(ht_vocab, words=words, heads=heads, deps=deps, pos=pos)
def test_noun_chunks_is_parsed(ht_tokenizer):
"""Test that noun_chunks raises Value Error for 'ht' language if Doc is not parsed."""
doc = ht_tokenizer("Sa a se yon fraz")
with pytest.raises(ValueError):
list(doc.noun_chunks)
def test_ht_noun_chunks_not_nested(doc, ht_vocab):
"""Test that each token only appears in one noun chunk at most"""
word_occurred = {}
chunks = list(doc.noun_chunks)
assert len(chunks) > 1
for chunk in chunks:
for word in chunk:
word_occurred.setdefault(word.text, 0)
word_occurred[word.text] += 1
assert len(word_occurred) > 0
for word, freq in word_occurred.items():
assert freq == 1, (word, [chunk.text for chunk in doc.noun_chunks])
def test_noun_chunks_span(doc, ht_tokenizer):
"""Test that the span.noun_chunks property works correctly"""
doc_chunks = list(doc.noun_chunks)
span = doc[0:3]
span_chunks = list(span.noun_chunks)
assert 0 < len(span_chunks) < len(doc_chunks)
for chunk in span_chunks:
assert chunk in doc_chunks
assert chunk.start >= 0
assert chunk.end <= 3

View File

@ -0,0 +1,130 @@
import pytest
@pytest.mark.parametrize("text", ["(ka)"])
def test_ht_tokenizer_splits_no_special(ht_tokenizer, text):
tokens = ht_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize("text", ["m'ap"])
def test_ht_tokenizer_splits_no_punct(ht_tokenizer, text):
tokens = ht_tokenizer(text)
assert len(tokens) == 2
@pytest.mark.parametrize("text", ["(m'ap"])
def test_ht_tokenizer_splits_prefix_punct(ht_tokenizer, text):
tokens = ht_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize("text", ["m'ap)"])
def test_ht_tokenizer_splits_suffix_punct(ht_tokenizer, text):
tokens = ht_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize("text", ["(m'ap)"])
def test_ht_tokenizer_splits_even_wrap(ht_tokenizer, text):
tokens = ht_tokenizer(text)
assert len(tokens) == 4
@pytest.mark.parametrize("text", ["(m'ap?)"])
def test_ht_tokenizer_splits_uneven_wrap(ht_tokenizer, text):
tokens = ht_tokenizer(text)
assert len(tokens) == 5
@pytest.mark.parametrize("text,length", [("Ozetazini.", 2), ("Frans.", 2), ("(Ozetazini.", 3)])
def test_ht_tokenizer_splits_prefix_interact(ht_tokenizer, text, length):
tokens = ht_tokenizer(text)
assert len(tokens) == length
@pytest.mark.parametrize("text", ["Ozetazini.)"])
def test_ht_tokenizer_splits_suffix_interact(ht_tokenizer, text):
tokens = ht_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize("text", ["(Ozetazini.)"])
def test_ht_tokenizer_splits_even_wrap_interact(ht_tokenizer, text):
tokens = ht_tokenizer(text)
assert len(tokens) == 4
@pytest.mark.parametrize("text", ["(Ozetazini?)"])
def test_ht_tokenizer_splits_uneven_wrap_interact(ht_tokenizer, text):
tokens = ht_tokenizer(text)
assert len(tokens) == 4
@pytest.mark.parametrize("text", ["pi-bon"])
def test_ht_tokenizer_splits_hyphens(ht_tokenizer, text):
tokens = ht_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize("text", ["0.1-13.5", "0.0-0.1", "103.27-300"])
def test_ht_tokenizer_splits_numeric_range(ht_tokenizer, text):
tokens = ht_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize("text", ["pi.Bon", "Bon.Jour"])
def test_ht_tokenizer_splits_period_infix(ht_tokenizer, text):
tokens = ht_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize("text", ["Bonjou,moun", "youn,de"])
def test_ht_tokenizer_splits_comma_infix(ht_tokenizer, text):
tokens = ht_tokenizer(text)
assert len(tokens) == 3
assert tokens[0].text == text.split(",")[0]
assert tokens[1].text == ","
assert tokens[2].text == text.split(",")[1]
@pytest.mark.parametrize("text", ["pi...Bon", "pi...bon"])
def test_ht_tokenizer_splits_ellipsis_infix(ht_tokenizer, text):
tokens = ht_tokenizer(text)
assert len(tokens) == 3
def test_ht_tokenizer_splits_double_hyphen_infix(ht_tokenizer):
tokens = ht_tokenizer("Pa vrè--men ou konnen--mwen renmen w.")
assert tokens[0].text == "Pa"
assert tokens[1].text == "vrè"
assert tokens[2].text == "--"
assert tokens[3].text == "men"
assert tokens[4].text == "ou"
assert tokens[5].text == "konnen"
assert tokens[6].text == "--"
assert tokens[7].text == "mwen"
assert tokens[8].text == "renmen"
assert tokens[9].text == "w"
assert tokens[10].text == "."
def test_ht_tokenizer_splits_period_abbr(ht_tokenizer):
text = "Jodi a se Madi.Mr."
tokens = ht_tokenizer(text)
assert len(tokens) == 7
assert tokens[0].text == "Jodi"
assert tokens[1].text == "a"
assert tokens[2].text == "se"
assert tokens[3].text == "Madi"
assert tokens[4].text == "."
assert tokens[5].text == "Mr"
assert tokens[6].text == "."
def test_ht_tokenizer_splits_paren_period(ht_tokenizer):
tokens = ht_tokenizer("M ap teste sa (pou kounye a).")
words = [t.text for t in tokens]
assert "a" in words
assert ")" in words
assert "." in words

View File

@ -0,0 +1,79 @@
import pytest
from spacy.lang.ht.lex_attrs import like_num, norm_custom
def test_ht_tokenizer_handles_long_text(ht_tokenizer):
text = """Onè ap fèt pou ansyen lidè Pati Travayè Britanik
Moun atravè lemond ap voye onè pou ansyen lidè
Pati Travayè a, John Smith, ki mouri pi bonè jodi a apre li te yon gwo kriz kadyak a laj 55 an.
Nan Washington, Depatman Deta Etazini pibliye yon deklarasyon ki eksprime "regre lanmò twò bonè" avoka ak palmantè eskoze a.
"Misye Smith, pandan tout karyè li ki te make ak distenksyon"""
tokens = ht_tokenizer(text)
assert len(tokens) == 84
@pytest.mark.parametrize(
"text,length",
[
("Map manje gato a pandan map gade televizyon lem lakay mwen.", 15),
("M'ap vini, eske wap la avek lajan'm? Si oui, di'l non pou fre'w.", 22),
("M ap teste sa (pou kounye a).", 10),
],
)
def test_ht_tokenizer_handles_cnts(ht_tokenizer, text, length):
tokens = ht_tokenizer(text)
assert len(tokens) == length
@pytest.mark.parametrize(
"text,match",
[
("10", True),
("1", True),
("10,000", True),
("10,00", True),
("999.0", True),
("en", True),
("de", True),
("milya", True),
("dog", False),
(",", False),
("1/2", True),
],
)
def test_lex_attrs_like_number(ht_tokenizer, text, match):
tokens = ht_tokenizer(text)
assert len(tokens) == 1
assert tokens[0].like_num == match
@pytest.mark.parametrize(
"word", ["ventyèm", "Milyonnyèm", "3yèm", "Santyèm", "25yèm", "52yèm"]
)
def test_ht_lex_attrs_like_number_for_ordinal(word):
assert like_num(word)
@pytest.mark.parametrize("word", ["onz"])
def test_ht_lex_attrs_capitals(word):
assert like_num(word)
assert like_num(word.upper())
@pytest.mark.parametrize(
"word, expected", [
("'m", "mwen"),
("'n", "nou"),
("'l", "li"),
("'y", "yo"),
("'w", "ou"),
]
)
def test_ht_lex_attrs_norm_custom(word, expected):
assert norm_custom(word) == expected

View File

@ -656,17 +656,12 @@ def test_spacy_blank():
@pytest.mark.parametrize(
"lang,target",
[
("en", "en"),
("fra", "fr"),
("fre", "fr"),
("iw", "he"),
("mo", "ro"),
("scc", "sr"),
("mul", "xx"),
("no", "nb"),
("pt-BR", "pt"),
("xx", "xx"),
("zh-Hans", "zh"),
("zh-Hant", None),
("zxx", None),
],
)
@ -686,11 +681,9 @@ def test_language_matching(lang, target):
("fre", "fr"),
("iw", "he"),
("mo", "ro"),
("scc", "sr"),
("mul", "xx"),
("no", "nb"),
("pt-BR", "pt"),
("xx", "xx"),
("zh-Hans", "zh"),
],
)
def test_blank_languages(lang, target):

View File

@ -479,10 +479,11 @@ cdef class Span:
break
elif i == self.doc.length - 1:
yield Span(self.doc, start, self.doc.length)
# Ensure that trailing parts of the Span instance are included in last element of .sents.
if start == self.doc.length - 1:
yield Span(self.doc, start, self.doc.length)
else:
# Ensure that trailing parts of the Span instance are included in the last element of .sents.
# We only want to do this if we didn't break above
if start == self.doc.length - 1:
yield Span(self.doc, start, self.doc.length)
@property
def ents(self):

View File

@ -5,7 +5,6 @@ import inspect
import itertools
import logging
import os
import pkgutil
import re
import shlex
import shutil
@ -40,7 +39,6 @@ from typing import (
)
import catalogue
import langcodes
import numpy
import srsly
import thinc
@ -89,6 +87,83 @@ LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "grc", "id", "lb", "mk", "pt"
# Default order of sections in the config file. Not all sections need to exist,
# and additional sections are added at the end, in alphabetical order.
CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining", "initialize"]
LANG_ALIASES = {
"af": ["afr"],
"am": ["amh"],
"ar": ["ara"],
"az": ["aze"],
"bg": ["bul"],
"bn": ["ben"],
"bo": ["bod", "tib"],
"ca": ["cat"],
"cs": ["ces", "cze"],
"da": ["dan"],
"de": ["deu", "ger"],
"el": ["ell", "gre"],
"en": ["eng"],
"es": ["spa"],
"et": ["est"],
"eu": ["eus", "baq"],
"fa": ["fas", "per"],
"fi": ["fin"],
"fo": ["fao"],
"fr": ["fra", "fre"],
"ga": ["gle"],
"gd": ["gla"],
"gu": ["guj"],
"he": ["heb", "iw"], # "iw" is the obsolete ISO 639-1 code for Hebrew
"hi": ["hin"],
"hr": ["hrv", "scr"], # "scr" is the deprecated ISO 639-2/B for Croatian
"hu": ["hun"],
"hy": ["hye"],
"id": ["ind", "in"], # "in" is the obsolete ISO 639-1 code for Hebrew
"is": ["isl", "ice"],
"it": ["ita"],
"ja": ["jpn"],
"kn": ["kan"],
"ko": ["kor"],
"ky": ["kir"],
"la": ["lat"],
"lb": ["ltz"],
"lg": ["lug"],
"lt": ["lit"],
"lv": ["lav"],
"mk": ["mkd", "mac"],
"ml": ["mal"],
"mr": ["mar"],
"ms": ["msa", "may"],
"nb": ["nob"],
"ne": ["nep"],
"nl": ["nld", "dut"],
"nn": ["nno"],
"pl": ["pol"],
"pt": ["por"],
"ro": ["ron", "rom", "mo", "mol"], # "mo" and "mol" are deprecated codes for Moldavian
"ru": ["rus"],
"sa": ["san"],
"si": ["sin"],
"sk": ["slk", "slo"],
"sl": ["slv"],
"sq": ["sqi", "alb"],
"sr": ["srp", "scc"], # "scc" is the deprecated ISO 639-2/B code for Serbian
"sv": ["swe"],
"ta": ["tam"],
"te": ["tel"],
"th": ["tha"],
"ti": ["tir"],
"tl": ["tgl"],
"tn": ["tsn"],
"tr": ["tur"],
"tt": ["tat"],
"uk": ["ukr"],
"ur": ["urd"],
"vi": ["viw"],
"yo": ["yor"],
"zh": ["zho", "chi"],
"xx": ["mul"],
}
# fmt: on
logger = logging.getLogger("spacy")
@ -305,63 +380,39 @@ def lang_class_is_loaded(lang: str) -> bool:
def find_matching_language(lang: str) -> Optional[str]:
"""
Given an IETF language code, find a supported spaCy language that is a
close match for it (according to Unicode CLDR language-matching rules).
This allows for language aliases, ISO 639-2 codes, more detailed language
tags, and close matches.
Given a two-letter ISO 639-1 or three-letter ISO 639-3 language code,
find a supported spaCy language.
Returns the language code if a matching language is available, or None
if there is no matching language.
>>> find_matching_language('en')
'en'
>>> find_matching_language('pt-BR') # Brazilian Portuguese
'pt'
>>> find_matching_language('fra') # an ISO 639-2 code for French
>>> find_matching_language('fra') # ISO 639-3 code for French
'fr'
>>> find_matching_language('iw') # obsolete alias for Hebrew
>>> find_matching_language('fre') # ISO 639-2/B code for French
'fr'
>>> find_matching_language('iw') # Obsolete ISO 639-1 code for Hebrew
'he'
>>> find_matching_language('no') # Norwegian
'nb'
>>> find_matching_language('mo') # old code for ro-MD
>>> find_matching_language('mo') # Deprecated code for Moldavian
'ro'
>>> find_matching_language('zh-Hans') # Simplified Chinese
'zh'
>>> find_matching_language('scc') # Deprecated ISO 639-2/B code for Serbian
'sr'
>>> find_matching_language('zxx')
None
"""
import spacy.lang # noqa: F401
if lang == "xx":
return "xx"
# Check aliases
for lang_code, aliases in LANG_ALIASES.items():
if lang in aliases:
return lang_code
# Find out which language modules we have
possible_languages = []
for modinfo in pkgutil.iter_modules(spacy.lang.__path__): # type: ignore[attr-defined]
code = modinfo.name
if code == "xx":
# Temporarily make 'xx' into a valid language code
possible_languages.append("mul")
elif langcodes.tag_is_valid(code):
possible_languages.append(code)
# Distances from 1-9 allow near misses like Bosnian -> Croatian and
# Norwegian -> Norwegian Bokmål. A distance of 10 would include several
# more possibilities, like variants of Chinese like 'wuu', but text that
# is labeled that way is probably trying to be distinct from 'zh' and
# shouldn't automatically match.
match = langcodes.closest_supported_match(lang, possible_languages, max_distance=9)
if match == "mul":
# Convert 'mul' back to spaCy's 'xx'
return "xx"
else:
return match
return None
def get_lang_class(lang: str) -> Type["Language"]:
"""Import and load a Language class.
lang (str): IETF language code, such as 'en'.
lang (str): Two-letter ISO 639-1 or three-letter ISO 639-3 language code, such as 'en' or 'eng'.
RETURNS (Language): Language class.
"""
# Check if language is registered / entry point is available
@ -372,13 +423,9 @@ def get_lang_class(lang: str) -> Type["Language"]:
try:
module = importlib.import_module(f".lang.{lang}", "spacy")
except ImportError as err:
# Find a matching language. For example, if the language 'no' is
# requested, we can use language-matching to load `spacy.lang.nb`.
try:
match = find_matching_language(lang)
except langcodes.tag_parser.LanguageTagError:
# proceed to raising an import error
match = None
# Find a matching language. For example, if the language 'eng' is
# requested, we can use language-matching to load `spacy.lang.en`.
match = find_matching_language(lang)
if match:
lang = match
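A short sketch of the resulting behaviour (the aliases above let three-letter and deprecated codes resolve to registered languages):
from spacy.util import get_lang_class
assert get_lang_class("eng").lang == "en"  # ISO 639-3 alias
assert get_lang_class("fre").lang == "fr"  # ISO 639-2/B alias
assert get_lang_class("iw").lang == "he"   # obsolete ISO 639-1 code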

View File

@ -230,7 +230,7 @@ $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--tr
| Name | Description |
| ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `lang` | Pipeline language [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as `en`. ~~str (positional)~~ |
| `lang` | Pipeline language. Two-letter [ISO 639-1 code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) or three-letter [ISO 639-3 code](https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes), such as `en` or `eng`. ~~str (positional)~~ |
| `vectors_loc` | Location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Path (positional)~~ |
| `output_dir` | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~ |
| `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ |

View File

@ -1078,7 +1078,7 @@ details.
| Name | Description |
| ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `Defaults` | Settings, data and factory methods for creating the `nlp` object and processing pipeline. ~~Defaults~~ |
| `lang` | [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as 'en' for English. ~~str~~ |
| `lang` | Two-letter [ISO 639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) or three-letter [ISO 639-3](https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes) language code, such as 'en' or 'eng' for English. ~~str~~ |
| `default_config` | Base [config](/usage/training#config) to use for [Language.config](/api/language#config). Defaults to [`default_config.cfg`](%%GITHUB_SPACY/spacy/default_config.cfg). ~~Config~~ |
## Defaults {id="defaults"}

View File

@ -561,7 +561,7 @@ overlaps with will be returned.
| `orth_` | Verbatim text content (identical to `Span.text`). Exists mostly for consistency with the other attributes. ~~str~~ |
| `label` | The hash value of the span's label. ~~int~~ |
| `label_` | The span's label. ~~str~~ |
| `lemma_` | The span's lemma. Equivalent to `"".join(token.text_with_ws for token in span)`. ~~str~~ |
| `lemma_` | The span's lemma. Equivalent to `"".join(token.lemma_ + token.whitespace_ for token in span).strip()`. ~~str~~ |
| `kb_id` | The hash value of the knowledge base ID referred to by the span. ~~int~~ |
| `kb_id_` | The knowledge base ID referred to by the span. ~~str~~ |
| `ent_id` | The hash value of the named entity the root token is an instance of. ~~int~~ |

View File

@ -86,7 +86,7 @@ Create a blank pipeline of a given language class. This function is the twin of
| Name | Description |
| ----------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `name` | [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as 'en', of the language class to load. ~~str~~ |
| `name` | Two-letter [ISO 639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) or three-letter [ISO 639-3](https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes) language code, such as 'en' or 'eng', of the language class to load. ~~str~~ |
| _keyword-only_ | |
| `vocab` | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ |
| `config` <Tag variant="new">3</Tag> | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |

View File

@ -51,7 +51,7 @@ modified later.
| `strings` | The string store. A new string store is created if one is not provided. Defaults to `None`. ~~Optional[StringStore]~~ |
| `shape` | Size of the table as `(n_entries, n_columns)`, the number of entries and number of columns. Not required if you're initializing the object with `data` and `keys`. ~~Tuple[int, int]~~ |
| `data` | The vector data. ~~numpy.ndarray[ndim=2, dtype=float32]~~ |
| `keys` | A sequence of keys aligned with the data. ~~Iterable[Union[str, int]]~~ |
| `keys` | An iterable of keys aligned with the data. ~~Iterable[Union[str, int]]~~ |
| `name` | A name to identify the vectors table. ~~str~~ |
| `mode` <Tag variant="new">3.2</Tag> | Vectors mode: `"default"` or [`"floret"`](https://github.com/explosion/floret) (default: `"default"`). ~~str~~ |
| `minn` <Tag variant="new">3.2</Tag> | The floret char ngram minn (default: `0`). ~~int~~ |

View File

@ -283,7 +283,7 @@ Serialize the current state to a binary string.
| -------------- | ------------------------------------------------------------------------------------------- |
| _keyword-only_ | |
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | The serialized form of the `Vocab` object. ~~Vocab~~ |
| **RETURNS** | The serialized form of the `Vocab` object. ~~bytes~~ |
## Vocab.from_bytes {id="from_bytes",tag="method"}