diff --git a/requirements.txt b/requirements.txt
index e55d25a19..7fc8ab32e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -17,7 +17,6 @@ requests>=2.13.0,<3.0.0
tqdm>=4.38.0,<5.0.0
pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0
jinja2
-langcodes>=3.2.0,<4.0.0
# Official Python utilities
setuptools
packaging>=20.0
diff --git a/setup.cfg b/setup.cfg
index f00b5408e..f4d50d424 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -65,7 +65,6 @@ install_requires =
# Official Python utilities
setuptools
packaging>=20.0
- langcodes>=3.2.0,<4.0.0
[options.entry_points]
console_scripts =
diff --git a/spacy/lang/ht/__init__.py b/spacy/lang/ht/__init__.py
new file mode 100644
index 000000000..e5c1c2770
--- /dev/null
+++ b/spacy/lang/ht/__init__.py
@@ -0,0 +1,52 @@
+from typing import Callable, Optional
+
+from thinc.api import Model
+
+from ...language import BaseDefaults, Language
+from .lemmatizer import HaitianCreoleLemmatizer
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tag_map import TAG_MAP
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+
+
+class HaitianCreoleDefaults(BaseDefaults):
+ tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+ prefixes = TOKENIZER_PREFIXES
+ infixes = TOKENIZER_INFIXES
+ suffixes = TOKENIZER_SUFFIXES
+ lex_attr_getters = LEX_ATTRS
+ syntax_iterators = SYNTAX_ITERATORS
+ stop_words = STOP_WORDS
+ tag_map = TAG_MAP
+
+class HaitianCreole(Language):
+ lang = "ht"
+ Defaults = HaitianCreoleDefaults
+
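+# Register a language-specific "lemmatizer" factory so that "ht" pipelines
+# use HaitianCreoleLemmatizer (rule mode by default).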
+@HaitianCreole.factory(
+ "lemmatizer",
+ assigns=["token.lemma"],
+ default_config={
+ "model": None,
+ "mode": "rule",
+ "overwrite": False,
+ "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+ },
+ default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+ nlp: Language,
+ model: Optional[Model],
+ name: str,
+ mode: str,
+ overwrite: bool,
+ scorer: Optional[Callable],
+):
+ return HaitianCreoleLemmatizer(
+ nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+ )
+
+__all__ = ["HaitianCreole"]
diff --git a/spacy/lang/ht/examples.py b/spacy/lang/ht/examples.py
new file mode 100644
index 000000000..456d34a5f
--- /dev/null
+++ b/spacy/lang/ht/examples.py
@@ -0,0 +1,18 @@
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.ht.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+ "Apple ap panse achte yon demaraj nan Wayòm Ini pou $1 milya dola",
+ "Machin otonòm fè responsablite asirans lan ale sou men fabrikan yo",
+ "San Francisco ap konsidere entèdi robo ki livre sou twotwa yo",
+ "Lond se yon gwo vil nan Wayòm Ini",
+ "Kote ou ye?",
+ "Kilès ki prezidan Lafrans?",
+ "Ki kapital Etazini?",
+ "Kile Barack Obama te fèt?",
+]
diff --git a/spacy/lang/ht/lemmatizer.py b/spacy/lang/ht/lemmatizer.py
new file mode 100644
index 000000000..9ac096f6d
--- /dev/null
+++ b/spacy/lang/ht/lemmatizer.py
@@ -0,0 +1,51 @@
+from typing import List, Tuple
+
+from ...pipeline import Lemmatizer
+from ...tokens import Token
+from ...lookups import Lookups
+
+
+class HaitianCreoleLemmatizer(Lemmatizer):
+ """
+ Minimal Haitian Creole lemmatizer.
+ Returns a word's base form based on rules and lookup,
+ or defaults to the original form.
+ """
+
+ def is_base_form(self, token: Token) -> bool:
+ morph = token.morph.to_dict()
+ upos = token.pos_.lower()
+
+ # Consider unmarked forms to be base
+ if upos in {"noun", "verb", "adj", "adv"}:
+ if not morph:
+ return True
+ if upos == "noun" and morph.get("Number") == "Sing":
+ return True
+ if upos == "verb" and morph.get("VerbForm") == "Inf":
+ return True
+ if upos == "adj" and morph.get("Degree") == "Pos":
+ return True
+ return False
+
+ def rule_lemmatize(self, token: Token) -> List[str]:
+ string = token.text.lower()
+ pos = token.pos_.lower()
+ cache_key = (token.orth, token.pos)
+ if cache_key in self.cache:
+ return self.cache[cache_key]
+
+ forms = []
+
+ # fallback rule: just return lowercased form
+ forms.append(string)
+
+ self.cache[cache_key] = forms
+ return forms
+
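+    # In "rule" mode, require the standard lemma lookup tables;
+    # other modes defer to the base Lemmatizer.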
+ @classmethod
+ def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
+ if mode == "rule":
+ required = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
+ return (required, [])
+ return super().get_lookups_config(mode)
diff --git a/spacy/lang/ht/lex_attrs.py b/spacy/lang/ht/lex_attrs.py
new file mode 100644
index 000000000..8a3ec1ff9
--- /dev/null
+++ b/spacy/lang/ht/lex_attrs.py
@@ -0,0 +1,78 @@
+from ...attrs import LIKE_NUM, NORM
+
+# Cardinal numbers in Creole
+_num_words = set(
+ """
+zewo youn en de twa kat senk sis sèt uit nèf dis
+onz douz trèz katoz kenz sèz disèt dizwit diznèf
+vent trant karant sinkant swasant swasann-dis
+san mil milyon milya
+""".split()
+)
+
+# Ordinal numbers in Creole (some are French-influenced, some simplified)
+_ordinal_words = set(
+ """
+premye dezyèm twazyèm katryèm senkyèm sizyèm sètvyèm uitvyèm nèvyèm dizyèm
+onzèm douzyèm trèzyèm katozyèm kenzèm sèzyèm disetyèm dizwityèm diznèvyèm
+ventyèm trantyèm karantyèm sinkantyèm swasantyèm
+swasann-disyèm santyèm milyèm milyonnyèm milyadyèm
+""".split()
+)
+
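+# Map contracted pronouns and particles (with or without apostrophes) to their
+# full forms for the NORM attribute.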
+NORM_MAP = {
+ "'m": "mwen",
+ "'w": "ou",
+ "'l": "li",
+ "'n": "nou",
+ "'y": "yo",
+ "’m": "mwen",
+ "’w": "ou",
+ "’l": "li",
+ "’n": "nou",
+ "’y": "yo",
+ "m": "mwen",
+ "n": "nou",
+ "l": "li",
+ "y": "yo",
+ "w": "ou",
+ "t": "te",
+ "k": "ki",
+ "p": "pa",
+ "M": "Mwen",
+ "N": "Nou",
+ "L": "Li",
+ "Y": "Yo",
+ "W": "Ou",
+ "T": "Te",
+ "K": "Ki",
+ "P": "Pa",
+}
+
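+# True for digits, simple fractions, and Creole cardinal/ordinal number words,
+# including digit + "yèm" ordinals such as "3yèm".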
+def like_num(text):
+ text = text.strip().lower()
+ if text.startswith(("+", "-", "±", "~")):
+ text = text[1:]
+ text = text.replace(",", "").replace(".", "")
+ if text.isdigit():
+ return True
+ if text.count("/") == 1:
+ num, denom = text.split("/")
+ if num.isdigit() and denom.isdigit():
+ return True
+ if text in _num_words:
+ return True
+ if text in _ordinal_words:
+ return True
+ # Handle things like "3yèm", "10yèm", "25yèm", etc.
+ if text.endswith("yèm") and text[:-3].isdigit():
+ return True
+ return False
+
+def norm_custom(text):
+ return NORM_MAP.get(text, text.lower())
+
+LEX_ATTRS = {
+ LIKE_NUM: like_num,
+ NORM: norm_custom,
+}
diff --git a/spacy/lang/ht/punctuation.py b/spacy/lang/ht/punctuation.py
new file mode 100644
index 000000000..61d88d6e1
--- /dev/null
+++ b/spacy/lang/ht/punctuation.py
@@ -0,0 +1,43 @@
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ HYPHENS,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+ merge_chars,
+)
+
+ELISION = "'’"
+
+_prefixes_elision = "m n l y t k w"
+_prefixes_elision += " " + _prefixes_elision.upper()
+
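+# Prefix pattern: split an elided pronoun/particle prefix such as "m'" or "L'"
+# off the following word, e.g. "m'ap" -> "m'", "ap".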
+TOKENIZER_PREFIXES = LIST_PUNCT + LIST_QUOTES + [
+ r"(?:({pe})[{el}])(?=[{a}])".format(
+ a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
+ )
+]
+
+TOKENIZER_SUFFIXES = LIST_PUNCT + LIST_QUOTES + LIST_ELLIPSES + [
+ r"(?<=[0-9])%", # numbers like 10%
+ r"(?<=[0-9])(?:{h})".format(h=HYPHENS), # hyphens after numbers
+ r"(?<=[{a}])['’]".format(a=ALPHA), # apostrophes after letters
+ r"(?<=[{a}])['’][mwlnytk](?=\s|$)".format(a=ALPHA), # contractions
+ r"(?<=[{a}0-9])\)", # right parenthesis after letter/number
+ r"(?<=[{a}])\.(?=\s|$)".format(a=ALPHA), # period after letter if space or end of string
+ r"(?<=\))[\.\?!]", # punctuation immediately after right parenthesis
+]
+
+TOKENIZER_INFIXES = LIST_ELLIPSES + LIST_ICONS + [
+ r"(?<=[0-9])[+\-\*^](?=[0-9-])",
+ r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
+ al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
+ ),
+ r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+ r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
+ r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
+]
diff --git a/spacy/lang/ht/stop_words.py b/spacy/lang/ht/stop_words.py
new file mode 100644
index 000000000..6243887a4
--- /dev/null
+++ b/spacy/lang/ht/stop_words.py
@@ -0,0 +1,50 @@
+STOP_WORDS = set(
+ """
+a ak an ankò ant apre ap atò avan avanlè
+byen bò byenke
+
+chak
+
+de depi deja
+
+e en epi èske
+
+fò fòk
+
+gen genyen
+
+ki kisa kilès kote koukou konsa konbyen konn konnen kounye kouman
+
+la l laa le lè li lye lò
+
+m m' mwen
+
+nan nap nou n'
+
+ou oumenm
+
+pa paske pami pandan pito pou pral preske pwiske
+
+se selman si sou sòt
+
+ta tap tankou te toujou tou tan tout toutotan twòp tèl
+
+w w' wi wè
+
+y y' yo yon yonn
+
+non o oh eh
+
+sa san si swa
+
+men mèsi oswa osinon
+
+"""
+.split()
+)
+
+# Add common contractions, with and without apostrophe variants
+contractions = ["m'", "n'", "w'", "y'", "l'", "t'", "k'"]
+for apostrophe in ["'", "’", "‘"]:
+ for word in contractions:
+ STOP_WORDS.add(word.replace("'", apostrophe))
diff --git a/spacy/lang/ht/syntax_iterators.py b/spacy/lang/ht/syntax_iterators.py
new file mode 100644
index 000000000..44ff17f74
--- /dev/null
+++ b/spacy/lang/ht/syntax_iterators.py
@@ -0,0 +1,74 @@
+from typing import Iterator, Tuple, Union
+
+from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
+from ...tokens import Doc, Span
+
+
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
+ """
+ Detect base noun phrases from a dependency parse for Haitian Creole.
+ Works on both Doc and Span objects.
+ """
+
+ # Core nominal dependencies common in Haitian Creole
+ labels = [
+ "nsubj",
+ "obj",
+ "obl",
+ "nmod",
+ "appos",
+ "ROOT",
+ ]
+
+ # Modifiers to optionally include in chunk (to the right)
+ post_modifiers = ["compound", "flat", "flat:name", "fixed"]
+
+ doc = doclike.doc
+ if not doc.has_annotation("DEP"):
+ raise ValueError(Errors.E029)
+
+ np_deps = {doc.vocab.strings.add(label) for label in labels}
+ np_mods = {doc.vocab.strings.add(mod) for mod in post_modifiers}
+ conj_label = doc.vocab.strings.add("conj")
+ np_label = doc.vocab.strings.add("NP")
+ adp_pos = doc.vocab.strings.add("ADP")
+ cc_pos = doc.vocab.strings.add("CCONJ")
+
+ prev_end = -1
+ for i, word in enumerate(doclike):
+ if word.pos not in (NOUN, PROPN, PRON):
+ continue
+ if word.left_edge.i <= prev_end:
+ continue
+
+ if word.dep in np_deps:
+ right_end = word
+ # expand to include known modifiers to the right
+ for child in word.rights:
+ if child.dep in np_mods:
+ right_end = child.right_edge
+ elif child.pos == NOUN:
+ right_end = child.right_edge
+
+ left_index = word.left_edge.i
+ # Skip prepositions at the start
+ if word.left_edge.pos == adp_pos:
+ left_index += 1
+
+ prev_end = right_end.i
+ yield left_index, right_end.i + 1, np_label
+
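+        # Conjoined items: walk up the conjunction chain and yield a chunk
+        # when the chain's head carries one of the nominal dependencies above.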
+ elif word.dep == conj_label:
+ head = word.head
+ while head.dep == conj_label and head.head.i < head.i:
+ head = head.head
+ if head.dep in np_deps:
+ left_index = word.left_edge.i
+ if word.left_edge.pos == cc_pos:
+ left_index += 1
+ prev_end = word.i
+ yield left_index, word.i + 1, np_label
+
+
+SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
diff --git a/spacy/lang/ht/tag_map.py b/spacy/lang/ht/tag_map.py
new file mode 100644
index 000000000..8c9cdd6d4
--- /dev/null
+++ b/spacy/lang/ht/tag_map.py
@@ -0,0 +1,21 @@
+from spacy.symbols import (
+    NOUN, VERB, AUX, ADJ, ADV, PRON, DET, ADP, SCONJ,
+    CCONJ, PART, INTJ, NUM, PROPN, PUNCT, SYM, X,
+)
+
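+# One-to-one mapping from Universal Dependencies coarse-grained tags to spaCy POS symbols.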
+TAG_MAP = {
+ "NOUN": {"pos": NOUN},
+ "VERB": {"pos": VERB},
+ "AUX": {"pos": AUX},
+ "ADJ": {"pos": ADJ},
+ "ADV": {"pos": ADV},
+ "PRON": {"pos": PRON},
+ "DET": {"pos": DET},
+ "ADP": {"pos": ADP},
+ "SCONJ": {"pos": SCONJ},
+ "CCONJ": {"pos": CCONJ},
+ "PART": {"pos": PART},
+ "INTJ": {"pos": INTJ},
+ "NUM": {"pos": NUM},
+ "PROPN": {"pos": PROPN},
+ "PUNCT": {"pos": PUNCT},
+ "SYM": {"pos": SYM},
+ "X": {"pos": X},
+}
diff --git a/spacy/lang/ht/tokenizer_exceptions.py b/spacy/lang/ht/tokenizer_exceptions.py
new file mode 100644
index 000000000..b44ad7a6f
--- /dev/null
+++ b/spacy/lang/ht/tokenizer_exceptions.py
@@ -0,0 +1,121 @@
+from spacy.symbols import ORTH, NORM
+
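+# Build tokenizer exceptions for an apostrophe contraction and its capitalized variant,
+# splitting it into the elided pronoun plus the following particle (e.g. "m'ap" -> "m'" + "ap").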
+def make_variants(base, first_norm, second_orth, second_norm):
+ return {
+ base: [
+ {ORTH: base.split("'")[0] + "'", NORM: first_norm},
+ {ORTH: second_orth, NORM: second_norm},
+ ],
+ base.capitalize(): [
+ {ORTH: base.split("'")[0].capitalize() + "'", NORM: first_norm.capitalize()},
+ {ORTH: second_orth, NORM: second_norm},
+ ]
+ }
+
+TOKENIZER_EXCEPTIONS = {
+ "Dr.": [{ORTH: "Dr."}]
+}
+
+# Apostrophe forms
+TOKENIZER_EXCEPTIONS.update(make_variants("m'ap", "mwen", "ap", "ap"))
+TOKENIZER_EXCEPTIONS.update(make_variants("n'ap", "nou", "ap", "ap"))
+TOKENIZER_EXCEPTIONS.update(make_variants("l'ap", "li", "ap", "ap"))
+TOKENIZER_EXCEPTIONS.update(make_variants("y'ap", "yo", "ap", "ap"))
+TOKENIZER_EXCEPTIONS.update(make_variants("m'te", "mwen", "te", "te"))
+TOKENIZER_EXCEPTIONS.update(make_variants("m'pral", "mwen", "pral", "pral"))
+TOKENIZER_EXCEPTIONS.update(make_variants("w'ap", "ou", "ap", "ap"))
+TOKENIZER_EXCEPTIONS.update(make_variants("k'ap", "ki", "ap", "ap"))
+TOKENIZER_EXCEPTIONS.update(make_variants("p'ap", "pa", "ap", "ap"))
+TOKENIZER_EXCEPTIONS.update(make_variants("t'ap", "te", "ap", "ap"))
+
+# Non-apostrophe contractions (with capitalized variants)
+TOKENIZER_EXCEPTIONS.update({
+ "map": [
+ {ORTH: "m", NORM: "mwen"},
+ {ORTH: "ap", NORM: "ap"},
+ ],
+ "Map": [
+ {ORTH: "M", NORM: "Mwen"},
+ {ORTH: "ap", NORM: "ap"},
+ ],
+ "lem": [
+ {ORTH: "le", NORM: "le"},
+ {ORTH: "m", NORM: "mwen"},
+ ],
+ "Lem": [
+ {ORTH: "Le", NORM: "Le"},
+ {ORTH: "m", NORM: "mwen"},
+ ],
+ "lew": [
+ {ORTH: "le", NORM: "le"},
+ {ORTH: "w", NORM: "ou"},
+ ],
+ "Lew": [
+ {ORTH: "Le", NORM: "Le"},
+ {ORTH: "w", NORM: "ou"},
+ ],
+ "nap": [
+ {ORTH: "n", NORM: "nou"},
+ {ORTH: "ap", NORM: "ap"},
+ ],
+ "Nap": [
+ {ORTH: "N", NORM: "Nou"},
+ {ORTH: "ap", NORM: "ap"},
+ ],
+ "lap": [
+ {ORTH: "l", NORM: "li"},
+ {ORTH: "ap", NORM: "ap"},
+ ],
+ "Lap": [
+ {ORTH: "L", NORM: "Li"},
+ {ORTH: "ap", NORM: "ap"},
+ ],
+ "yap": [
+ {ORTH: "y", NORM: "yo"},
+ {ORTH: "ap", NORM: "ap"},
+ ],
+ "Yap": [
+ {ORTH: "Y", NORM: "Yo"},
+ {ORTH: "ap", NORM: "ap"},
+ ],
+ "mte": [
+ {ORTH: "m", NORM: "mwen"},
+ {ORTH: "te", NORM: "te"},
+ ],
+ "Mte": [
+ {ORTH: "M", NORM: "Mwen"},
+ {ORTH: "te", NORM: "te"},
+ ],
+ "mpral": [
+ {ORTH: "m", NORM: "mwen"},
+ {ORTH: "pral", NORM: "pral"},
+ ],
+ "Mpral": [
+ {ORTH: "M", NORM: "Mwen"},
+ {ORTH: "pral", NORM: "pral"},
+ ],
+ "wap": [
+ {ORTH: "w", NORM: "ou"},
+ {ORTH: "ap", NORM: "ap"},
+ ],
+ "Wap": [
+ {ORTH: "W", NORM: "Ou"},
+ {ORTH: "ap", NORM: "ap"},
+ ],
+ "kap": [
+ {ORTH: "k", NORM: "ki"},
+ {ORTH: "ap", NORM: "ap"},
+ ],
+ "Kap": [
+ {ORTH: "K", NORM: "Ki"},
+ {ORTH: "ap", NORM: "ap"},
+ ],
+ "tap": [
+ {ORTH: "t", NORM: "te"},
+ {ORTH: "ap", NORM: "ap"},
+ ],
+ "Tap": [
+ {ORTH: "T", NORM: "Te"},
+ {ORTH: "ap", NORM: "ap"},
+ ],
+})
diff --git a/spacy/language.py b/spacy/language.py
index 9cdd724f5..5b9eb8bd2 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -141,7 +141,7 @@ class Language:
Defaults (class): Settings, data and factory methods for creating the `nlp`
object and processing pipeline.
- lang (str): IETF language code, such as 'en'.
+    lang (str): Two-letter ISO 639-1 or three-letter ISO 639-3 language code, such as 'en' or 'eng'.
DOCS: https://spacy.io/api/language
"""
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index e30300a33..ae5255c28 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -212,6 +212,16 @@ def hr_tokenizer():
return get_lang_class("hr")().tokenizer
+@pytest.fixture(scope="session")
+def ht_tokenizer():
+ return get_lang_class("ht")().tokenizer
+
+
+@pytest.fixture(scope="session")
+def ht_vocab():
+ return get_lang_class("ht")().vocab
+
+
@pytest.fixture
def hu_tokenizer():
return get_lang_class("hu")().tokenizer
diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index 98a74bc21..7167b68ac 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -49,7 +49,7 @@ def doc_not_parsed(en_tokenizer):
def test_issue1537():
"""Test that Span.as_doc() doesn't segfault."""
string = "The sky is blue . The man is pink . The dog is purple ."
- doc = Doc(Vocab(), words=string.split())
+ doc = Doc(Vocab(), words=list(string.split()))
doc[0].sent_start = True
for word in doc[1:]:
if word.nbor(-1).text == ".":
@@ -225,6 +225,21 @@ def test_spans_span_sent(doc, doc_not_parsed):
assert doc_not_parsed[10:14].sent == doc_not_parsed[5:]
+def test_issue13769():
+    # Test issue 13769: incorrect output of Span.sents when the final token is its own sentence outside the span.
+ doc = Doc(
+ Vocab(),
+ words=list("This is a sentence . This is another sentence . Third".split()),
+ )
+ doc[0].is_sent_start = True
+ doc[5].is_sent_start = True
+ doc[10].is_sent_start = True
+ doc.ents = [("ENTITY", 7, 9)] # "another sentence" phrase in the second sentence
+ entity = doc.ents[0]
+ ent_sents = list(entity.sents)
+ assert len(ent_sents) == 1
+
+
@pytest.mark.parametrize(
"start,end,expected_sentence",
[
diff --git a/spacy/tests/lang/ht/__init__.py b/spacy/tests/lang/ht/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/tests/lang/ht/test_exceptions.py b/spacy/tests/lang/ht/test_exceptions.py
new file mode 100644
index 000000000..685b72c07
--- /dev/null
+++ b/spacy/tests/lang/ht/test_exceptions.py
@@ -0,0 +1,32 @@
+import pytest
+
+
+def test_ht_tokenizer_handles_basic_contraction(ht_tokenizer):
+ text = "m'ap ri"
+ tokens = ht_tokenizer(text)
+ assert len(tokens) == 3
+ assert tokens[0].text == "m'"
+ assert tokens[1].text == "ap"
+ assert tokens[2].text == "ri"
+
+ text = "mwen di'w non!"
+ tokens = ht_tokenizer(text)
+ assert len(tokens) == 5
+ assert tokens[0].text == "mwen"
+ assert tokens[1].text == "di"
+ assert tokens[2].text == "'w"
+ assert tokens[3].text == "non"
+ assert tokens[4].text == "!"
+
+
+@pytest.mark.parametrize("text", ["Dr."])
+def test_ht_tokenizer_handles_basic_abbreviation(ht_tokenizer, text):
+ tokens = ht_tokenizer(text)
+ assert len(tokens) == 1
+ assert tokens[0].text == text
+
+
+def test_ht_tokenizer_full_sentence(ht_tokenizer):
+ text = "Si'm ka vini, m'ap pale ak li."
+ tokens = [t.text for t in ht_tokenizer(text)]
+ assert tokens == ["Si", "'m", "ka", "vini", ",", "m'", "ap", "pale", "ak", "li", "."]
diff --git a/spacy/tests/lang/ht/test_noun_chunks.py b/spacy/tests/lang/ht/test_noun_chunks.py
new file mode 100644
index 000000000..76c5a1df3
--- /dev/null
+++ b/spacy/tests/lang/ht/test_noun_chunks.py
@@ -0,0 +1,44 @@
+import pytest
+from spacy.tokens import Doc
+
+
+@pytest.fixture
+def doc(ht_vocab):
+ words = ["Pitit", "gen", "gwo", "pwoblèm", "ak", "kontwòl"]
+ heads = [1, 1, 5, 5, 3, 3]
+ deps = ["nsubj", "ROOT", "amod", "obj", "case", "nmod"]
+ pos = ["NOUN", "VERB", "ADJ", "NOUN", "ADP", "NOUN"]
+ return Doc(ht_vocab, words=words, heads=heads, deps=deps, pos=pos)
+
+
+def test_noun_chunks_is_parsed(ht_tokenizer):
+ """Test that noun_chunks raises Value Error for 'ht' language if Doc is not parsed."""
+ doc = ht_tokenizer("Sa a se yon fraz")
+ with pytest.raises(ValueError):
+ list(doc.noun_chunks)
+
+
+def test_ht_noun_chunks_not_nested(doc, ht_vocab):
+ """Test that each token only appears in one noun chunk at most"""
+ word_occurred = {}
+ chunks = list(doc.noun_chunks)
+ assert len(chunks) > 1
+ for chunk in chunks:
+ for word in chunk:
+ word_occurred.setdefault(word.text, 0)
+ word_occurred[word.text] += 1
+ assert len(word_occurred) > 0
+ for word, freq in word_occurred.items():
+ assert freq == 1, (word, [chunk.text for chunk in doc.noun_chunks])
+
+
+def test_noun_chunks_span(doc, ht_tokenizer):
+ """Test that the span.noun_chunks property works correctly"""
+ doc_chunks = list(doc.noun_chunks)
+ span = doc[0:3]
+ span_chunks = list(span.noun_chunks)
+ assert 0 < len(span_chunks) < len(doc_chunks)
+ for chunk in span_chunks:
+ assert chunk in doc_chunks
+ assert chunk.start >= 0
+ assert chunk.end <= 3
diff --git a/spacy/tests/lang/ht/test_prefix_suffix_infix.py b/spacy/tests/lang/ht/test_prefix_suffix_infix.py
new file mode 100644
index 000000000..7dabec17a
--- /dev/null
+++ b/spacy/tests/lang/ht/test_prefix_suffix_infix.py
@@ -0,0 +1,130 @@
+import pytest
+
+
+@pytest.mark.parametrize("text", ["(ka)"])
+def test_ht_tokenizer_splits_no_special(ht_tokenizer, text):
+ tokens = ht_tokenizer(text)
+ assert len(tokens) == 3
+
+
+@pytest.mark.parametrize("text", ["m'ap"])
+def test_ht_tokenizer_splits_no_punct(ht_tokenizer, text):
+ tokens = ht_tokenizer(text)
+ assert len(tokens) == 2
+
+
+@pytest.mark.parametrize("text", ["(m'ap"])
+def test_ht_tokenizer_splits_prefix_punct(ht_tokenizer, text):
+ tokens = ht_tokenizer(text)
+ assert len(tokens) == 3
+
+
+@pytest.mark.parametrize("text", ["m'ap)"])
+def test_ht_tokenizer_splits_suffix_punct(ht_tokenizer, text):
+ tokens = ht_tokenizer(text)
+ assert len(tokens) == 3
+
+
+@pytest.mark.parametrize("text", ["(m'ap)"])
+def test_ht_tokenizer_splits_even_wrap(ht_tokenizer, text):
+ tokens = ht_tokenizer(text)
+ assert len(tokens) == 4
+
+
+@pytest.mark.parametrize("text", ["(m'ap?)"])
+def test_ht_tokenizer_splits_uneven_wrap(ht_tokenizer, text):
+ tokens = ht_tokenizer(text)
+ assert len(tokens) == 5
+
+
+@pytest.mark.parametrize("text,length", [("Ozetazini.", 2), ("Frans.", 2), ("(Ozetazini.", 3)])
+def test_ht_tokenizer_splits_prefix_interact(ht_tokenizer, text, length):
+ tokens = ht_tokenizer(text)
+ assert len(tokens) == length
+
+
+@pytest.mark.parametrize("text", ["Ozetazini.)"])
+def test_ht_tokenizer_splits_suffix_interact(ht_tokenizer, text):
+ tokens = ht_tokenizer(text)
+ assert len(tokens) == 3
+
+
+@pytest.mark.parametrize("text", ["(Ozetazini.)"])
+def test_ht_tokenizer_splits_even_wrap_interact(ht_tokenizer, text):
+ tokens = ht_tokenizer(text)
+ assert len(tokens) == 4
+
+
+@pytest.mark.parametrize("text", ["(Ozetazini?)"])
+def test_ht_tokenizer_splits_uneven_wrap_interact(ht_tokenizer, text):
+ tokens = ht_tokenizer(text)
+ assert len(tokens) == 4
+
+
+@pytest.mark.parametrize("text", ["pi-bon"])
+def test_ht_tokenizer_splits_hyphens(ht_tokenizer, text):
+ tokens = ht_tokenizer(text)
+ assert len(tokens) == 3
+
+
+@pytest.mark.parametrize("text", ["0.1-13.5", "0.0-0.1", "103.27-300"])
+def test_ht_tokenizer_splits_numeric_range(ht_tokenizer, text):
+ tokens = ht_tokenizer(text)
+ assert len(tokens) == 3
+
+
+@pytest.mark.parametrize("text", ["pi.Bon", "Bon.Jour"])
+def test_ht_tokenizer_splits_period_infix(ht_tokenizer, text):
+ tokens = ht_tokenizer(text)
+ assert len(tokens) == 3
+
+
+@pytest.mark.parametrize("text", ["Bonjou,moun", "youn,de"])
+def test_ht_tokenizer_splits_comma_infix(ht_tokenizer, text):
+ tokens = ht_tokenizer(text)
+ assert len(tokens) == 3
+ assert tokens[0].text == text.split(",")[0]
+ assert tokens[1].text == ","
+ assert tokens[2].text == text.split(",")[1]
+
+
+@pytest.mark.parametrize("text", ["pi...Bon", "pi...bon"])
+def test_ht_tokenizer_splits_ellipsis_infix(ht_tokenizer, text):
+ tokens = ht_tokenizer(text)
+ assert len(tokens) == 3
+
+
+def test_ht_tokenizer_splits_double_hyphen_infix(ht_tokenizer):
+ tokens = ht_tokenizer("Pa vrè--men ou konnen--mwen renmen w.")
+ assert tokens[0].text == "Pa"
+ assert tokens[1].text == "vrè"
+ assert tokens[2].text == "--"
+ assert tokens[3].text == "men"
+ assert tokens[4].text == "ou"
+ assert tokens[5].text == "konnen"
+ assert tokens[6].text == "--"
+ assert tokens[7].text == "mwen"
+ assert tokens[8].text == "renmen"
+ assert tokens[9].text == "w"
+ assert tokens[10].text == "."
+
+
+def test_ht_tokenizer_splits_period_abbr(ht_tokenizer):
+ text = "Jodi a se Madi.Mr."
+ tokens = ht_tokenizer(text)
+ assert len(tokens) == 7
+ assert tokens[0].text == "Jodi"
+ assert tokens[1].text == "a"
+ assert tokens[2].text == "se"
+ assert tokens[3].text == "Madi"
+ assert tokens[4].text == "."
+ assert tokens[5].text == "Mr"
+ assert tokens[6].text == "."
+
+
+def test_ht_tokenizer_splits_paren_period(ht_tokenizer):
+ tokens = ht_tokenizer("M ap teste sa (pou kounye a).")
+ words = [t.text for t in tokens]
+ assert "a" in words
+ assert ")" in words
+ assert "." in words
diff --git a/spacy/tests/lang/ht/test_text.py b/spacy/tests/lang/ht/test_text.py
new file mode 100644
index 000000000..f396e352a
--- /dev/null
+++ b/spacy/tests/lang/ht/test_text.py
@@ -0,0 +1,79 @@
+import pytest
+
+from spacy.lang.ht.lex_attrs import like_num, norm_custom
+
+
+def test_ht_tokenizer_handles_long_text(ht_tokenizer):
+ text = """Onè ap fèt pou ansyen lidè Pati Travayè Britanik
+
+Moun atravè lemond ap voye onè pou ansyen lidè
+Pati Travayè a, John Smith, ki mouri pi bonè jodi a apre li te fè yon gwo kriz kadyak a laj 55 an.
+
+Nan Washington, Depatman Deta Etazini pibliye yon deklarasyon ki eksprime "regre lanmò twò bonè" avoka ak palmantè eskoze a.
+
+"Misye Smith, pandan tout karyè li ki te make ak distenksyon"""
+ tokens = ht_tokenizer(text)
+ assert len(tokens) == 84
+
+
+@pytest.mark.parametrize(
+ "text,length",
+ [
+ ("Map manje gato a pandan map gade televizyon lem lakay mwen.", 15),
+ ("M'ap vini, eske wap la avek lajan'm? Si oui, di'l non pou fre'w.", 22),
+ ("M ap teste sa (pou kounye a).", 10),
+ ],
+)
+def test_ht_tokenizer_handles_cnts(ht_tokenizer, text, length):
+ tokens = ht_tokenizer(text)
+ assert len(tokens) == length
+
+
+@pytest.mark.parametrize(
+ "text,match",
+ [
+ ("10", True),
+ ("1", True),
+ ("10,000", True),
+ ("10,00", True),
+ ("999.0", True),
+ ("en", True),
+ ("de", True),
+ ("milya", True),
+ ("dog", False),
+ (",", False),
+ ("1/2", True),
+ ],
+)
+def test_lex_attrs_like_number(ht_tokenizer, text, match):
+ tokens = ht_tokenizer(text)
+ assert len(tokens) == 1
+ assert tokens[0].like_num == match
+
+
+@pytest.mark.parametrize(
+ "word", ["ventyèm", "Milyonnyèm", "3yèm", "Santyèm", "25yèm", "52yèm"]
+)
+def test_ht_lex_attrs_like_number_for_ordinal(word):
+ assert like_num(word)
+
+
+@pytest.mark.parametrize("word", ["onz"])
+def test_ht_lex_attrs_capitals(word):
+ assert like_num(word)
+ assert like_num(word.upper())
+
+
+@pytest.mark.parametrize(
+ "word, expected", [
+ ("'m", "mwen"),
+ ("'n", "nou"),
+ ("'l", "li"),
+ ("'y", "yo"),
+ ("'w", "ou"),
+ ]
+)
+def test_ht_lex_attrs_norm_custom(word, expected):
+ assert norm_custom(word) == expected
+
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index f946528ae..9818d5d7c 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -656,17 +656,12 @@ def test_spacy_blank():
@pytest.mark.parametrize(
"lang,target",
[
- ("en", "en"),
("fra", "fr"),
("fre", "fr"),
("iw", "he"),
("mo", "ro"),
+ ("scc", "sr"),
("mul", "xx"),
- ("no", "nb"),
- ("pt-BR", "pt"),
- ("xx", "xx"),
- ("zh-Hans", "zh"),
- ("zh-Hant", None),
("zxx", None),
],
)
@@ -686,11 +681,9 @@ def test_language_matching(lang, target):
("fre", "fr"),
("iw", "he"),
("mo", "ro"),
+ ("scc", "sr"),
("mul", "xx"),
- ("no", "nb"),
- ("pt-BR", "pt"),
("xx", "xx"),
- ("zh-Hans", "zh"),
],
)
def test_blank_languages(lang, target):
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 64b8d7c6c..a7faf0d62 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -479,10 +479,11 @@ cdef class Span:
break
elif i == self.doc.length - 1:
yield Span(self.doc, start, self.doc.length)
-
- # Ensure that trailing parts of the Span instance are included in last element of .sents.
- if start == self.doc.length - 1:
- yield Span(self.doc, start, self.doc.length)
+ else:
+ # Ensure that trailing parts of the Span instance are included in last element of .sents.
+ # We only want to do this if we didn't break above
+ if start == self.doc.length - 1:
+ yield Span(self.doc, start, self.doc.length)
@property
def ents(self):
diff --git a/spacy/util.py b/spacy/util.py
index f1e68696b..527e6eb3a 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -5,7 +5,6 @@ import inspect
import itertools
import logging
import os
-import pkgutil
import re
import shlex
import shutil
@@ -40,7 +39,6 @@ from typing import (
)
import catalogue
-import langcodes
import numpy
import srsly
import thinc
@@ -89,6 +87,83 @@ LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "grc", "id", "lb", "mk", "pt"
# Default order of sections in the config file. Not all sections needs to exist,
# and additional sections are added at the end, in alphabetical order.
CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining", "initialize"]
+
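+# Map spaCy language codes to the ISO 639-3 / ISO 639-2 and deprecated aliases
+# accepted by find_matching_language() below.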
+LANG_ALIASES = {
+ "af": ["afr"],
+ "am": ["amh"],
+ "ar": ["ara"],
+ "az": ["aze"],
+ "bg": ["bul"],
+ "bn": ["ben"],
+ "bo": ["bod", "tib"],
+ "ca": ["cat"],
+ "cs": ["ces", "cze"],
+ "da": ["dan"],
+ "de": ["deu", "ger"],
+ "el": ["ell", "gre"],
+ "en": ["eng"],
+ "es": ["spa"],
+ "et": ["est"],
+ "eu": ["eus", "baq"],
+ "fa": ["fas", "per"],
+ "fi": ["fin"],
+ "fo": ["fao"],
+ "fr": ["fra", "fre"],
+ "ga": ["gle"],
+ "gd": ["gla"],
+ "gu": ["guj"],
+ "he": ["heb", "iw"], # "iw" is the obsolete ISO 639-1 code for Hebrew
+ "hi": ["hin"],
+ "hr": ["hrv", "scr"], # "scr" is the deprecated ISO 639-2/B for Croatian
+ "hu": ["hun"],
+ "hy": ["hye"],
+ "id": ["ind", "in"], # "in" is the obsolete ISO 639-1 code for Hebrew
+ "is": ["isl", "ice"],
+ "it": ["ita"],
+ "ja": ["jpn"],
+ "kn": ["kan"],
+ "ko": ["kor"],
+ "ky": ["kir"],
+ "la": ["lat"],
+ "lb": ["ltz"],
+ "lg": ["lug"],
+ "lt": ["lit"],
+ "lv": ["lav"],
+ "mk": ["mkd", "mac"],
+ "ml": ["mal"],
+ "mr": ["mar"],
+ "ms": ["msa", "may"],
+ "nb": ["nob"],
+ "ne": ["nep"],
+ "nl": ["nld", "dut"],
+ "nn": ["nno"],
+ "pl": ["pol"],
+ "pt": ["por"],
+ "ro": ["ron", "rom", "mo", "mol"], # "mo" and "mol" are deprecated codes for Moldavian
+ "ru": ["rus"],
+ "sa": ["san"],
+ "si": ["sin"],
+ "sk": ["slk", "slo"],
+ "sl": ["slv"],
+ "sq": ["sqi", "alb"],
+ "sr": ["srp", "scc"], # "scc" is the deprecated ISO 639-2/B code for Serbian
+ "sv": ["swe"],
+ "ta": ["tam"],
+ "te": ["tel"],
+ "th": ["tha"],
+ "ti": ["tir"],
+ "tl": ["tgl"],
+ "tn": ["tsn"],
+ "tr": ["tur"],
+ "tt": ["tat"],
+ "uk": ["ukr"],
+ "ur": ["urd"],
+ "vi": ["viw"],
+ "yo": ["yor"],
+ "zh": ["zho", "chi"],
+
+ "xx": ["mul"],
+}
# fmt: on
logger = logging.getLogger("spacy")
@@ -305,63 +380,39 @@ def lang_class_is_loaded(lang: str) -> bool:
def find_matching_language(lang: str) -> Optional[str]:
"""
- Given an IETF language code, find a supported spaCy language that is a
- close match for it (according to Unicode CLDR language-matching rules).
- This allows for language aliases, ISO 639-2 codes, more detailed language
- tags, and close matches.
+ Given a two-letter ISO 639-1 or three-letter ISO 639-3 language code,
+ find a supported spaCy language.
Returns the language code if a matching language is available, or None
if there is no matching language.
- >>> find_matching_language('en')
- 'en'
- >>> find_matching_language('pt-BR') # Brazilian Portuguese
- 'pt'
- >>> find_matching_language('fra') # an ISO 639-2 code for French
+ >>> find_matching_language('fra') # ISO 639-3 code for French
'fr'
- >>> find_matching_language('iw') # obsolete alias for Hebrew
+ >>> find_matching_language('fre') # ISO 639-2/B code for French
+ 'fr'
+ >>> find_matching_language('iw') # Obsolete ISO 639-1 code for Hebrew
'he'
- >>> find_matching_language('no') # Norwegian
- 'nb'
- >>> find_matching_language('mo') # old code for ro-MD
+ >>> find_matching_language('mo') # Deprecated code for Moldavian
'ro'
- >>> find_matching_language('zh-Hans') # Simplified Chinese
- 'zh'
+ >>> find_matching_language('scc') # Deprecated ISO 639-2/B code for Serbian
+ 'sr'
>>> find_matching_language('zxx')
None
"""
import spacy.lang # noqa: F401
- if lang == "xx":
- return "xx"
+ # Check aliases
+ for lang_code, aliases in LANG_ALIASES.items():
+ if lang in aliases:
+ return lang_code
- # Find out which language modules we have
- possible_languages = []
- for modinfo in pkgutil.iter_modules(spacy.lang.__path__): # type: ignore[attr-defined]
- code = modinfo.name
- if code == "xx":
- # Temporarily make 'xx' into a valid language code
- possible_languages.append("mul")
- elif langcodes.tag_is_valid(code):
- possible_languages.append(code)
-
- # Distances from 1-9 allow near misses like Bosnian -> Croatian and
- # Norwegian -> Norwegian Bokmål. A distance of 10 would include several
- # more possibilities, like variants of Chinese like 'wuu', but text that
- # is labeled that way is probably trying to be distinct from 'zh' and
- # shouldn't automatically match.
- match = langcodes.closest_supported_match(lang, possible_languages, max_distance=9)
- if match == "mul":
- # Convert 'mul' back to spaCy's 'xx'
- return "xx"
- else:
- return match
+ return None
def get_lang_class(lang: str) -> Type["Language"]:
"""Import and load a Language class.
- lang (str): IETF language code, such as 'en'.
+    lang (str): Two-letter ISO 639-1 or three-letter ISO 639-3 language code, such as 'en' or 'eng'.
RETURNS (Language): Language class.
"""
# Check if language is registered / entry point is available
@@ -372,13 +423,9 @@ def get_lang_class(lang: str) -> Type["Language"]:
try:
module = importlib.import_module(f".lang.{lang}", "spacy")
except ImportError as err:
- # Find a matching language. For example, if the language 'no' is
- # requested, we can use language-matching to load `spacy.lang.nb`.
- try:
- match = find_matching_language(lang)
- except langcodes.tag_parser.LanguageTagError:
- # proceed to raising an import error
- match = None
+ # Find a matching language. For example, if the language 'eng' is
+ # requested, we can use language-matching to load `spacy.lang.en`.
+ match = find_matching_language(lang)
if match:
lang = match
diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx
index 6c47c8f1e..09a978259 100644
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@@ -230,7 +230,7 @@ $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--tr
| Name | Description |
| ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `lang` | Pipeline language [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as `en`. ~~str (positional)~~ |
+| `lang` | Pipeline language. Two-letter [ISO 639-1 code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) or three-letter [ISO 639-3 code](https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes), such as `en` or `eng`. ~~str (positional)~~ |
| `vectors_loc` | Location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Path (positional)~~ |
| `output_dir` | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~ |
| `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ |
diff --git a/website/docs/api/language.mdx b/website/docs/api/language.mdx
index b969ddc53..a1c6601ab 100644
--- a/website/docs/api/language.mdx
+++ b/website/docs/api/language.mdx
@@ -1078,7 +1078,7 @@ details.
| Name | Description |
| ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `Defaults` | Settings, data and factory methods for creating the `nlp` object and processing pipeline. ~~Defaults~~ |
-| `lang` | [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as 'en' for English. ~~str~~ |
+| `lang` | Two-letter [ISO 639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) or three-letter [ISO 639-3](https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes) language code, such as 'en' or 'eng' for English. ~~str~~ |
| `default_config` | Base [config](/usage/training#config) to use for [Language.config](/api/language#config). Defaults to [`default_config.cfg`](%%GITHUB_SPACY/spacy/default_config.cfg). ~~Config~~ |
## Defaults {id="defaults"}
diff --git a/website/docs/api/span.mdx b/website/docs/api/span.mdx
index 225ff6e6a..d44015382 100644
--- a/website/docs/api/span.mdx
+++ b/website/docs/api/span.mdx
@@ -561,7 +561,7 @@ overlaps with will be returned.
| `orth_` | Verbatim text content (identical to `Span.text`). Exists mostly for consistency with the other attributes. ~~str~~ |
| `label` | The hash value of the span's label. ~~int~~ |
| `label_` | The span's label. ~~str~~ |
-| `lemma_` | The span's lemma. Equivalent to `"".join(token.text_with_ws for token in span)`. ~~str~~ |
+| `lemma_` | The span's lemma. Equivalent to `"".join(token.lemma_ + token.whitespace_ for token in span).strip()`. ~~str~~ |
| `kb_id` | The hash value of the knowledge base ID referred to by the span. ~~int~~ |
| `kb_id_` | The knowledge base ID referred to by the span. ~~str~~ |
| `ent_id` | The hash value of the named entity the root token is an instance of. ~~int~~ |
diff --git a/website/docs/api/top-level.mdx b/website/docs/api/top-level.mdx
index 9cdc0c8ab..340f10f77 100644
--- a/website/docs/api/top-level.mdx
+++ b/website/docs/api/top-level.mdx
@@ -86,7 +86,7 @@ Create a blank pipeline of a given language class. This function is the twin of
| Name | Description |
| ----------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `name` | [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as 'en', of the language class to load. ~~str~~ |
+| `name` | Two-letter [ISO 639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) or three-letter [ISO 639-3](https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes) language code, such as 'en' or 'eng', of the language class to load. ~~str~~ |
| _keyword-only_ | |
| `vocab` | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ |
| `config` 3 | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |
diff --git a/website/docs/api/vectors.mdx b/website/docs/api/vectors.mdx
index 0e92eb12b..da2d7831a 100644
--- a/website/docs/api/vectors.mdx
+++ b/website/docs/api/vectors.mdx
@@ -51,7 +51,7 @@ modified later.
| `strings` | The string store. A new string store is created if one is not provided. Defaults to `None`. ~~Optional[StringStore]~~ |
| `shape` | Size of the table as `(n_entries, n_columns)`, the number of entries and number of columns. Not required if you're initializing the object with `data` and `keys`. ~~Tuple[int, int]~~ |
| `data` | The vector data. ~~numpy.ndarray[ndim=2, dtype=float32]~~ |
-| `keys` | A sequence of keys aligned with the data. ~~Iterable[Union[str, int]]~~ |
+| `keys` | An iterable of keys aligned with the data. ~~Iterable[Union[str, int]]~~ |
| `name` | A name to identify the vectors table. ~~str~~ |
| `mode` 3.2 | Vectors mode: `"default"` or [`"floret"`](https://github.com/explosion/floret) (default: `"default"`). ~~str~~ |
| `minn` 3.2 | The floret char ngram minn (default: `0`). ~~int~~ |
diff --git a/website/docs/api/vocab.mdx b/website/docs/api/vocab.mdx
index 57618397d..2466f561b 100644
--- a/website/docs/api/vocab.mdx
+++ b/website/docs/api/vocab.mdx
@@ -283,7 +283,7 @@ Serialize the current state to a binary string.
| -------------- | ------------------------------------------------------------------------------------------- |
| _keyword-only_ | |
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
-| **RETURNS** | The serialized form of the `Vocab` object. ~~Vocab~~ |
+| **RETURNS** | The serialized form of the `Vocab` object. ~~bytes~~ |
## Vocab.from_bytes {id="from_bytes",tag="method"}