diff --git a/spacy/lang/ht/__init__.py b/spacy/lang/ht/__init__.py new file mode 100644 index 000000000..e5c1c2770 --- /dev/null +++ b/spacy/lang/ht/__init__.py @@ -0,0 +1,52 @@ +from typing import Callable, Optional + +from thinc.api import Model + +from ...language import BaseDefaults, Language +from .lemmatizer import HaitianCreoleLemmatizer +from .lex_attrs import LEX_ATTRS +from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES, TOKENIZER_SUFFIXES +from .stop_words import STOP_WORDS +from .syntax_iterators import SYNTAX_ITERATORS +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS +from .tag_map import TAG_MAP + + +class HaitianCreoleDefaults(BaseDefaults): + tokenizer_exceptions = TOKENIZER_EXCEPTIONS + prefixes = TOKENIZER_PREFIXES + infixes = TOKENIZER_INFIXES + suffixes = TOKENIZER_SUFFIXES + lex_attr_getters = LEX_ATTRS + syntax_iterators = SYNTAX_ITERATORS + stop_words = STOP_WORDS + tag_map = TAG_MAP + +class HaitianCreole(Language): + lang = "ht" + Defaults = HaitianCreoleDefaults + +@HaitianCreole.factory( + "lemmatizer", + assigns=["token.lemma"], + default_config={ + "model": None, + "mode": "rule", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, + default_score_weights={"lemma_acc": 1.0}, +) +def make_lemmatizer( + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], +): + return HaitianCreoleLemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) + +__all__ = ["HaitianCreole"] diff --git a/spacy/lang/ht/examples.py b/spacy/lang/ht/examples.py new file mode 100644 index 000000000..456d34a5f --- /dev/null +++ b/spacy/lang/ht/examples.py @@ -0,0 +1,18 @@ +""" +Example sentences to test spaCy and its language models. + +>>> from spacy.lang.ht.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "Apple ap panse achte yon demaraj nan Wayòm Ini pou $1 milya dola", + "Machin otonòm fè responsablite asirans lan ale sou men fabrikan yo", + "San Francisco ap konsidere entèdi robo ki livre sou twotwa yo", + "Lond se yon gwo vil nan Wayòm Ini", + "Kote ou ye?", + "Kilès ki prezidan Lafrans?", + "Ki kapital Etazini?", + "Kile Barack Obama te fèt?", +] diff --git a/spacy/lang/ht/lemmatizer.py b/spacy/lang/ht/lemmatizer.py new file mode 100644 index 000000000..9ac096f6d --- /dev/null +++ b/spacy/lang/ht/lemmatizer.py @@ -0,0 +1,51 @@ +from typing import List, Tuple + +from ...pipeline import Lemmatizer +from ...tokens import Token +from ...lookups import Lookups + + +class HaitianCreoleLemmatizer(Lemmatizer): + """ + Minimal Haitian Creole lemmatizer. + Returns a word's base form based on rules and lookup, + or defaults to the original form. 
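# --- Reviewer sketch, not part of the patch: minimal usage of the new "ht" language.
# Assumes this branch is installed so that spacy.blank("ht") picks up HaitianCreoleDefaults.
# The token texts in the first comment are the ones asserted in
# spacy/tests/lang/ht/test_exceptions.py later in this diff; the is_stop check is my
# own expectation based on STOP_WORDS.
import spacy

nlp = spacy.blank("ht")
doc = nlp("m'ap ri")
print([t.text for t in doc])      # ["m'", "ap", "ri"] per the tokenizer exceptions
print(nlp.vocab["mwen"].is_stop)  # expected True: "mwen" is listed in STOP_WORDS
# Adding the registered lemmatizer is left commented out: mode="rule" requires the
# lemma_lookup/lemma_rules/lemma_exc/lemma_index tables (see get_lookups_config in
# lemmatizer.py), which this diff does not ship.
# nlp.add_pipe("lemmatizer", config={"mode": "rule"})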
+ """ + + def is_base_form(self, token: Token) -> bool: + morph = token.morph.to_dict() + upos = token.pos_.lower() + + # Consider unmarked forms to be base + if upos in {"noun", "verb", "adj", "adv"}: + if not morph: + return True + if upos == "noun" and morph.get("Number") == "Sing": + return True + if upos == "verb" and morph.get("VerbForm") == "Inf": + return True + if upos == "adj" and morph.get("Degree") == "Pos": + return True + return False + + def rule_lemmatize(self, token: Token) -> List[str]: + string = token.text.lower() + pos = token.pos_.lower() + cache_key = (token.orth, token.pos) + if cache_key in self.cache: + return self.cache[cache_key] + + forms = [] + + # fallback rule: just return lowercased form + forms.append(string) + + self.cache[cache_key] = forms + return forms + + @classmethod + def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]: + if mode == "rule": + required = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"] + return (required, []) + return super().get_lookups_config(mode) diff --git a/spacy/lang/ht/lex_attrs.py b/spacy/lang/ht/lex_attrs.py new file mode 100644 index 000000000..8a3ec1ff9 --- /dev/null +++ b/spacy/lang/ht/lex_attrs.py @@ -0,0 +1,78 @@ +from ...attrs import LIKE_NUM, NORM + +# Cardinal numbers in Creole +_num_words = set( + """ +zewo youn en de twa kat senk sis sèt uit nèf dis +onz douz trèz katoz kenz sèz disèt dizwit diznèf +vent trant karant sinkant swasant swasann-dis +san mil milyon milya +""".split() +) + +# Ordinal numbers in Creole (some are French-influenced, some simplified) +_ordinal_words = set( + """ +premye dezyèm twazyèm katryèm senkyèm sizyèm sètvyèm uitvyèm nèvyèm dizyèm +onzèm douzyèm trèzyèm katozyèm kenzèm sèzyèm disetyèm dizwityèm diznèvyèm +ventyèm trantyèm karantyèm sinkantyèm swasantyèm +swasann-disyèm santyèm milyèm milyonnyèm milyadyèm +""".split() +) + +NORM_MAP = { + "'m": "mwen", + "'w": "ou", + "'l": "li", + "'n": "nou", + "'y": "yo", + "’m": "mwen", + "’w": "ou", + "’l": "li", + "’n": "nou", + "’y": "yo", + "m": "mwen", + "n": "nou", + "l": "li", + "y": "yo", + "w": "ou", + "t": "te", + "k": "ki", + "p": "pa", + "M": "Mwen", + "N": "Nou", + "L": "Li", + "Y": "Yo", + "W": "Ou", + "T": "Te", + "K": "Ki", + "P": "Pa", +} + +def like_num(text): + text = text.strip().lower() + if text.startswith(("+", "-", "±", "~")): + text = text[1:] + text = text.replace(",", "").replace(".", "") + if text.isdigit(): + return True + if text.count("/") == 1: + num, denom = text.split("/") + if num.isdigit() and denom.isdigit(): + return True + if text in _num_words: + return True + if text in _ordinal_words: + return True + # Handle things like "3yèm", "10yèm", "25yèm", etc. 
+ if text.endswith("yèm") and text[:-3].isdigit(): + return True + return False + +def norm_custom(text): + return NORM_MAP.get(text, text.lower()) + +LEX_ATTRS = { + LIKE_NUM: like_num, + NORM: norm_custom, +} diff --git a/spacy/lang/ht/punctuation.py b/spacy/lang/ht/punctuation.py new file mode 100644 index 000000000..61d88d6e1 --- /dev/null +++ b/spacy/lang/ht/punctuation.py @@ -0,0 +1,43 @@ +from ..char_classes import ( + ALPHA, + ALPHA_LOWER, + ALPHA_UPPER, + CONCAT_QUOTES, + HYPHENS, + LIST_PUNCT, + LIST_QUOTES, + LIST_ELLIPSES, + LIST_ICONS, + merge_chars, +) + +ELISION = "'’".replace(" ", "") + +_prefixes_elision = "m n l y t k w" +_prefixes_elision += " " + _prefixes_elision.upper() + +TOKENIZER_PREFIXES = LIST_PUNCT + LIST_QUOTES + [ + r"(?:({pe})[{el}])(?=[{a}])".format( + a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision) + ) +] + +TOKENIZER_SUFFIXES = LIST_PUNCT + LIST_QUOTES + LIST_ELLIPSES + [ + r"(?<=[0-9])%", # numbers like 10% + r"(?<=[0-9])(?:{h})".format(h=HYPHENS), # hyphens after numbers + r"(?<=[{a}])['’]".format(a=ALPHA), # apostrophes after letters + r"(?<=[{a}])['’][mwlnytk](?=\s|$)".format(a=ALPHA), # contractions + r"(?<=[{a}0-9])\)", # right parenthesis after letter/number + r"(?<=[{a}])\.(?=\s|$)".format(a=ALPHA), # period after letter if space or end of string + r"(?<=\))[\.\?!]", # punctuation immediately after right parenthesis +] + +TOKENIZER_INFIXES = LIST_ELLIPSES + LIST_ICONS + [ + r"(?<=[0-9])[+\-\*^](?=[0-9-])", + r"(?<=[{al}{q}])\.(?=[{au}{q}])".format( + al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES + ), + r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS), + r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION), +] diff --git a/spacy/lang/ht/stop_words.py b/spacy/lang/ht/stop_words.py new file mode 100644 index 000000000..6243887a4 --- /dev/null +++ b/spacy/lang/ht/stop_words.py @@ -0,0 +1,50 @@ +STOP_WORDS = set( + """ +a ak an ankò ant apre ap atò avan avanlè +byen bò byenke + +chak + +de depi deja deja + +e en epi èske + +fò fòk + +gen genyen + +ki kisa kilès kote koukou konsa konbyen konn konnen kounye kouman + +la l laa le lè li lye lò + +m m' mwen + +nan nap nou n' + +ou oumenm + +pa paske pami pandan pito pou pral preske pwiske + +se selman si sou sòt + +ta tap tankou te toujou tou tan tout toutotan twòp tèl + +w w' wi wè + +y y' yo yon yonn + +non o oh eh + +sa san si swa si + +men mèsi oswa osinon + +""" +.split() +) + +# Add common contractions, with and without apostrophe variants +contractions = ["m'", "n'", "w'", "y'", "l'", "t'", "k'"] +for apostrophe in ["'", "’", "‘"]: + for word in contractions: + STOP_WORDS.add(word.replace("'", apostrophe)) diff --git a/spacy/lang/ht/syntax_iterators.py b/spacy/lang/ht/syntax_iterators.py new file mode 100644 index 000000000..44ff17f74 --- /dev/null +++ b/spacy/lang/ht/syntax_iterators.py @@ -0,0 +1,74 @@ +from typing import Iterator, Tuple, Union + +from ...errors import Errors +from ...symbols import NOUN, PRON, PROPN +from ...tokens import Doc, Span + + +def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: + """ + Detect base noun phrases from a dependency parse for Haitian Creole. + Works on both Doc and Span objects. 
+ """ + + # Core nominal dependencies common in Haitian Creole + labels = [ + "nsubj", + "obj", + "obl", + "nmod", + "appos", + "ROOT", + ] + + # Modifiers to optionally include in chunk (to the right) + post_modifiers = ["compound", "flat", "flat:name", "fixed"] + + doc = doclike.doc + if not doc.has_annotation("DEP"): + raise ValueError(Errors.E029) + + np_deps = {doc.vocab.strings.add(label) for label in labels} + np_mods = {doc.vocab.strings.add(mod) for mod in post_modifiers} + conj_label = doc.vocab.strings.add("conj") + np_label = doc.vocab.strings.add("NP") + adp_pos = doc.vocab.strings.add("ADP") + cc_pos = doc.vocab.strings.add("CCONJ") + + prev_end = -1 + for i, word in enumerate(doclike): + if word.pos not in (NOUN, PROPN, PRON): + continue + if word.left_edge.i <= prev_end: + continue + + if word.dep in np_deps: + right_end = word + # expand to include known modifiers to the right + for child in word.rights: + if child.dep in np_mods: + right_end = child.right_edge + elif child.pos == NOUN: + right_end = child.right_edge + + left_index = word.left_edge.i + # Skip prepositions at the start + if word.left_edge.pos == adp_pos: + left_index += 1 + + prev_end = right_end.i + yield left_index, right_end.i + 1, np_label + + elif word.dep == conj_label: + head = word.head + while head.dep == conj_label and head.head.i < head.i: + head = head.head + if head.dep in np_deps: + left_index = word.left_edge.i + if word.left_edge.pos == cc_pos: + left_index += 1 + prev_end = word.i + yield left_index, word.i + 1, np_label + + +SYNTAX_ITERATORS = {"noun_chunks": noun_chunks} diff --git a/spacy/lang/ht/tag_map.py b/spacy/lang/ht/tag_map.py new file mode 100644 index 000000000..8c9cdd6d4 --- /dev/null +++ b/spacy/lang/ht/tag_map.py @@ -0,0 +1,21 @@ +from spacy.symbols import NOUN, VERB, AUX, ADJ, ADV, PRON, DET, ADP, SCONJ, CCONJ, PART, INTJ, NUM, PROPN, PUNCT, SYM, X + +TAG_MAP = { + "NOUN": {"pos": NOUN}, + "VERB": {"pos": VERB}, + "AUX": {"pos": AUX}, + "ADJ": {"pos": ADJ}, + "ADV": {"pos": ADV}, + "PRON": {"pos": PRON}, + "DET": {"pos": DET}, + "ADP": {"pos": ADP}, + "SCONJ": {"pos": SCONJ}, + "CCONJ": {"pos": CCONJ}, + "PART": {"pos": PART}, + "INTJ": {"pos": INTJ}, + "NUM": {"pos": NUM}, + "PROPN": {"pos": PROPN}, + "PUNCT": {"pos": PUNCT}, + "SYM": {"pos": SYM}, + "X": {"pos": X}, +} diff --git a/spacy/lang/ht/tokenizer_exceptions.py b/spacy/lang/ht/tokenizer_exceptions.py new file mode 100644 index 000000000..b44ad7a6f --- /dev/null +++ b/spacy/lang/ht/tokenizer_exceptions.py @@ -0,0 +1,121 @@ +from spacy.symbols import ORTH, NORM + +def make_variants(base, first_norm, second_orth, second_norm): + return { + base: [ + {ORTH: base.split("'")[0] + "'", NORM: first_norm}, + {ORTH: second_orth, NORM: second_norm}, + ], + base.capitalize(): [ + {ORTH: base.split("'")[0].capitalize() + "'", NORM: first_norm.capitalize()}, + {ORTH: second_orth, NORM: second_norm}, + ] + } + +TOKENIZER_EXCEPTIONS = { + "Dr.": [{ORTH: "Dr."}] +} + +# Apostrophe forms +TOKENIZER_EXCEPTIONS.update(make_variants("m'ap", "mwen", "ap", "ap")) +TOKENIZER_EXCEPTIONS.update(make_variants("n'ap", "nou", "ap", "ap")) +TOKENIZER_EXCEPTIONS.update(make_variants("l'ap", "li", "ap", "ap")) +TOKENIZER_EXCEPTIONS.update(make_variants("y'ap", "yo", "ap", "ap")) +TOKENIZER_EXCEPTIONS.update(make_variants("m'te", "mwen", "te", "te")) +TOKENIZER_EXCEPTIONS.update(make_variants("m'pral", "mwen", "pral", "pral")) +TOKENIZER_EXCEPTIONS.update(make_variants("w'ap", "ou", "ap", "ap")) +TOKENIZER_EXCEPTIONS.update(make_variants("k'ap", 
"ki", "ap", "ap")) +TOKENIZER_EXCEPTIONS.update(make_variants("p'ap", "pa", "ap", "ap")) +TOKENIZER_EXCEPTIONS.update(make_variants("t'ap", "te", "ap", "ap")) + +# Non-apostrophe contractions (with capitalized variants) +TOKENIZER_EXCEPTIONS.update({ + "map": [ + {ORTH: "m", NORM: "mwen"}, + {ORTH: "ap", NORM: "ap"}, + ], + "Map": [ + {ORTH: "M", NORM: "Mwen"}, + {ORTH: "ap", NORM: "ap"}, + ], + "lem": [ + {ORTH: "le", NORM: "le"}, + {ORTH: "m", NORM: "mwen"}, + ], + "Lem": [ + {ORTH: "Le", NORM: "Le"}, + {ORTH: "m", NORM: "mwen"}, + ], + "lew": [ + {ORTH: "le", NORM: "le"}, + {ORTH: "w", NORM: "ou"}, + ], + "Lew": [ + {ORTH: "Le", NORM: "Le"}, + {ORTH: "w", NORM: "ou"}, + ], + "nap": [ + {ORTH: "n", NORM: "nou"}, + {ORTH: "ap", NORM: "ap"}, + ], + "Nap": [ + {ORTH: "N", NORM: "Nou"}, + {ORTH: "ap", NORM: "ap"}, + ], + "lap": [ + {ORTH: "l", NORM: "li"}, + {ORTH: "ap", NORM: "ap"}, + ], + "Lap": [ + {ORTH: "L", NORM: "Li"}, + {ORTH: "ap", NORM: "ap"}, + ], + "yap": [ + {ORTH: "y", NORM: "yo"}, + {ORTH: "ap", NORM: "ap"}, + ], + "Yap": [ + {ORTH: "Y", NORM: "Yo"}, + {ORTH: "ap", NORM: "ap"}, + ], + "mte": [ + {ORTH: "m", NORM: "mwen"}, + {ORTH: "te", NORM: "te"}, + ], + "Mte": [ + {ORTH: "M", NORM: "Mwen"}, + {ORTH: "te", NORM: "te"}, + ], + "mpral": [ + {ORTH: "m", NORM: "mwen"}, + {ORTH: "pral", NORM: "pral"}, + ], + "Mpral": [ + {ORTH: "M", NORM: "Mwen"}, + {ORTH: "pral", NORM: "pral"}, + ], + "wap": [ + {ORTH: "w", NORM: "ou"}, + {ORTH: "ap", NORM: "ap"}, + ], + "Wap": [ + {ORTH: "W", NORM: "Ou"}, + {ORTH: "ap", NORM: "ap"}, + ], + "kap": [ + {ORTH: "k", NORM: "ki"}, + {ORTH: "ap", NORM: "ap"}, + ], + "Kap": [ + {ORTH: "K", NORM: "Ki"}, + {ORTH: "ap", NORM: "ap"}, + ], + "tap": [ + {ORTH: "t", NORM: "te"}, + {ORTH: "ap", NORM: "ap"}, + ], + "Tap": [ + {ORTH: "T", NORM: "Te"}, + {ORTH: "ap", NORM: "ap"}, + ], +}) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index e30300a33..ae5255c28 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -212,6 +212,16 @@ def hr_tokenizer(): return get_lang_class("hr")().tokenizer +@pytest.fixture(scope="session") +def ht_tokenizer(): + return get_lang_class("ht")().tokenizer + + +@pytest.fixture(scope="session") +def ht_vocab(): + return get_lang_class("ht")().vocab + + @pytest.fixture def hu_tokenizer(): return get_lang_class("hu")().tokenizer diff --git a/spacy/tests/lang/ht/__init__.py b/spacy/tests/lang/ht/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/ht/test_exceptions.py b/spacy/tests/lang/ht/test_exceptions.py new file mode 100644 index 000000000..685b72c07 --- /dev/null +++ b/spacy/tests/lang/ht/test_exceptions.py @@ -0,0 +1,32 @@ +import pytest + + +def test_ht_tokenizer_handles_basic_contraction(ht_tokenizer): + text = "m'ap ri" + tokens = ht_tokenizer(text) + assert len(tokens) == 3 + assert tokens[0].text == "m'" + assert tokens[1].text == "ap" + assert tokens[2].text == "ri" + + text = "mwen di'w non!" + tokens = ht_tokenizer(text) + assert len(tokens) == 5 + assert tokens[0].text == "mwen" + assert tokens[1].text == "di" + assert tokens[2].text == "'w" + assert tokens[3].text == "non" + assert tokens[4].text == "!" + + +@pytest.mark.parametrize("text", ["Dr."]) +def test_ht_tokenizer_handles_basic_abbreviation(ht_tokenizer, text): + tokens = ht_tokenizer(text) + assert len(tokens) == 1 + assert tokens[0].text == text + + +def test_ht_tokenizer_full_sentence(ht_tokenizer): + text = "Si'm ka vini, m'ap pale ak li." 
+ tokens = [t.text for t in ht_tokenizer(text)] + assert tokens == ["Si", "'m", "ka", "vini", ",", "m'", "ap", "pale", "ak", "li", "."] diff --git a/spacy/tests/lang/ht/test_noun_chunks.py b/spacy/tests/lang/ht/test_noun_chunks.py new file mode 100644 index 000000000..76c5a1df3 --- /dev/null +++ b/spacy/tests/lang/ht/test_noun_chunks.py @@ -0,0 +1,44 @@ +import pytest +from spacy.tokens import Doc + + +@pytest.fixture +def doc(ht_vocab): + words = ["Pitit", "gen", "gwo", "pwoblèm", "ak", "kontwòl"] + heads = [1, 1, 5, 5, 3, 3] + deps = ["nsubj", "ROOT", "amod", "obj", "case", "nmod"] + pos = ["NOUN", "VERB", "ADJ", "NOUN", "ADP", "NOUN"] + return Doc(ht_vocab, words=words, heads=heads, deps=deps, pos=pos) + + +def test_noun_chunks_is_parsed(ht_tokenizer): + """Test that noun_chunks raises Value Error for 'ht' language if Doc is not parsed.""" + doc = ht_tokenizer("Sa a se yon fraz") + with pytest.raises(ValueError): + list(doc.noun_chunks) + + +def test_ht_noun_chunks_not_nested(doc, ht_vocab): + """Test that each token only appears in one noun chunk at most""" + word_occurred = {} + chunks = list(doc.noun_chunks) + assert len(chunks) > 1 + for chunk in chunks: + for word in chunk: + word_occurred.setdefault(word.text, 0) + word_occurred[word.text] += 1 + assert len(word_occurred) > 0 + for word, freq in word_occurred.items(): + assert freq == 1, (word, [chunk.text for chunk in doc.noun_chunks]) + + +def test_noun_chunks_span(doc, ht_tokenizer): + """Test that the span.noun_chunks property works correctly""" + doc_chunks = list(doc.noun_chunks) + span = doc[0:3] + span_chunks = list(span.noun_chunks) + assert 0 < len(span_chunks) < len(doc_chunks) + for chunk in span_chunks: + assert chunk in doc_chunks + assert chunk.start >= 0 + assert chunk.end <= 3 diff --git a/spacy/tests/lang/ht/test_prefix_suffix_infix.py b/spacy/tests/lang/ht/test_prefix_suffix_infix.py new file mode 100644 index 000000000..7dabec17a --- /dev/null +++ b/spacy/tests/lang/ht/test_prefix_suffix_infix.py @@ -0,0 +1,130 @@ +import pytest + + +@pytest.mark.parametrize("text", ["(ka)"]) +def test_ht_tokenizer_splits_no_special(ht_tokenizer, text): + tokens = ht_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize("text", ["m'ap"]) +def test_ht_tokenizer_splits_no_punct(ht_tokenizer, text): + tokens = ht_tokenizer(text) + assert len(tokens) == 2 + + +@pytest.mark.parametrize("text", ["(m'ap"]) +def test_ht_tokenizer_splits_prefix_punct(ht_tokenizer, text): + tokens = ht_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize("text", ["m'ap)"]) +def test_ht_tokenizer_splits_suffix_punct(ht_tokenizer, text): + tokens = ht_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize("text", ["(m'ap)"]) +def test_ht_tokenizer_splits_even_wrap(ht_tokenizer, text): + tokens = ht_tokenizer(text) + assert len(tokens) == 4 + + +@pytest.mark.parametrize("text", ["(m'ap?)"]) +def test_ht_tokenizer_splits_uneven_wrap(ht_tokenizer, text): + tokens = ht_tokenizer(text) + assert len(tokens) == 5 + + +@pytest.mark.parametrize("text,length", [("Ozetazini.", 2), ("Frans.", 2), ("(Ozetazini.", 3)]) +def test_ht_tokenizer_splits_prefix_interact(ht_tokenizer, text, length): + tokens = ht_tokenizer(text) + assert len(tokens) == length + + +@pytest.mark.parametrize("text", ["Ozetazini.)"]) +def test_ht_tokenizer_splits_suffix_interact(ht_tokenizer, text): + tokens = ht_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize("text", ["(Ozetazini.)"]) +def 
test_ht_tokenizer_splits_even_wrap_interact(ht_tokenizer, text): + tokens = ht_tokenizer(text) + assert len(tokens) == 4 + + +@pytest.mark.parametrize("text", ["(Ozetazini?)"]) +def test_ht_tokenizer_splits_uneven_wrap_interact(ht_tokenizer, text): + tokens = ht_tokenizer(text) + assert len(tokens) == 4 + + +@pytest.mark.parametrize("text", ["pi-bon"]) +def test_ht_tokenizer_splits_hyphens(ht_tokenizer, text): + tokens = ht_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize("text", ["0.1-13.5", "0.0-0.1", "103.27-300"]) +def test_ht_tokenizer_splits_numeric_range(ht_tokenizer, text): + tokens = ht_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize("text", ["pi.Bon", "Bon.Jour"]) +def test_ht_tokenizer_splits_period_infix(ht_tokenizer, text): + tokens = ht_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize("text", ["Bonjou,moun", "youn,de"]) +def test_ht_tokenizer_splits_comma_infix(ht_tokenizer, text): + tokens = ht_tokenizer(text) + assert len(tokens) == 3 + assert tokens[0].text == text.split(",")[0] + assert tokens[1].text == "," + assert tokens[2].text == text.split(",")[1] + + +@pytest.mark.parametrize("text", ["pi...Bon", "pi...bon"]) +def test_ht_tokenizer_splits_ellipsis_infix(ht_tokenizer, text): + tokens = ht_tokenizer(text) + assert len(tokens) == 3 + + +def test_ht_tokenizer_splits_double_hyphen_infix(ht_tokenizer): + tokens = ht_tokenizer("Pa vrè--men ou konnen--mwen renmen w.") + assert tokens[0].text == "Pa" + assert tokens[1].text == "vrè" + assert tokens[2].text == "--" + assert tokens[3].text == "men" + assert tokens[4].text == "ou" + assert tokens[5].text == "konnen" + assert tokens[6].text == "--" + assert tokens[7].text == "mwen" + assert tokens[8].text == "renmen" + assert tokens[9].text == "w" + assert tokens[10].text == "." + + +def test_ht_tokenizer_splits_period_abbr(ht_tokenizer): + text = "Jodi a se Madi.Mr." + tokens = ht_tokenizer(text) + assert len(tokens) == 7 + assert tokens[0].text == "Jodi" + assert tokens[1].text == "a" + assert tokens[2].text == "se" + assert tokens[3].text == "Madi" + assert tokens[4].text == "." + assert tokens[5].text == "Mr" + assert tokens[6].text == "." + + +def test_ht_tokenizer_splits_paren_period(ht_tokenizer): + tokens = ht_tokenizer("M ap teste sa (pou kounye a).") + words = [t.text for t in tokens] + assert "a" in words + assert ")" in words + assert "." in words diff --git a/spacy/tests/lang/ht/test_text.py b/spacy/tests/lang/ht/test_text.py new file mode 100644 index 000000000..f396e352a --- /dev/null +++ b/spacy/tests/lang/ht/test_text.py @@ -0,0 +1,79 @@ +import pytest + +from spacy.lang.ht.lex_attrs import like_num, norm_custom + + +def test_ht_tokenizer_handles_long_text(ht_tokenizer): + text = """Onè ap fèt pou ansyen lidè Pati Travayè Britanik + +Moun atravè lemond ap voye onè pou ansyen lidè +Pati Travayè a, John Smith, ki mouri pi bonè jodi a apre li te fè yon gwo kriz kadyak a laj 55 an. + +Nan Washington, Depatman Deta Etazini pibliye yon deklarasyon ki eksprime "regre lanmò twò bonè" avoka ak palmantè eskoze a. + +"Misye Smith, pandan tout karyè li ki te make ak distenksyon""" + tokens = ht_tokenizer(text) + assert len(tokens) == 84 + + + +@pytest.mark.parametrize( + "text,length", + [ + ("Map manje gato a pandan map gade televizyon lem lakay mwen.", 15), + ("M'ap vini, eske wap la avek lajan'm? 
Si oui, di'l non pou fre'w.", 22), + ("M ap teste sa (pou kounye a).", 10), + ], +) +def test_ht_tokenizer_handles_cnts(ht_tokenizer, text, length): + tokens = ht_tokenizer(text) + assert len(tokens) == length + + +@pytest.mark.parametrize( + "text,match", + [ + ("10", True), + ("1", True), + ("10,000", True), + ("10,00", True), + ("999.0", True), + ("en", True), + ("de", True), + ("milya", True), + ("dog", False), + (",", False), + ("1/2", True), + ], +) +def test_lex_attrs_like_number(ht_tokenizer, text, match): + tokens = ht_tokenizer(text) + assert len(tokens) == 1 + assert tokens[0].like_num == match + + +@pytest.mark.parametrize( + "word", ["ventyèm", "Milyonnyèm", "3yèm", "Santyèm", "25yèm", "52yèm"] +) +def test_ht_lex_attrs_like_number_for_ordinal(word): + assert like_num(word) + + +@pytest.mark.parametrize("word", ["onz"]) +def test_ht_lex_attrs_capitals(word): + assert like_num(word) + assert like_num(word.upper()) + + +@pytest.mark.parametrize( + "word, expected", [ + ("'m", "mwen"), + ("'n", "nou"), + ("'l", "li"), + ("'y", "yo"), + ("'w", "ou"), + ] +) +def test_ht_lex_attrs_norm_custom(word, expected): + assert norm_custom(word) == expected +
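# --- Reviewer sketch, not part of the patch: the lexical attribute getters can also be
# called directly, mirroring the parametrized tests above.
from spacy.lang.ht.lex_attrs import like_num, norm_custom

print(like_num("swasann-dis"))  # True: listed in _num_words
print(like_num("3yèm"))         # True: digits followed by the "yèm" ordinal suffix
print(like_num("dog"))          # False
print(norm_custom("'m"))        # "mwen", via NORM_MAP
print(norm_custom("Kreyòl"))    # "kreyòl": not in NORM_MAP, so it falls back to lowercasing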
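# --- Reviewer sketch, not part of the patch: what one make_variants() call in
# tokenizer_exceptions.py expands to, written out by hand for a single entry.
from spacy.symbols import ORTH, NORM
from spacy.lang.ht.tokenizer_exceptions import make_variants

assert make_variants("m'ap", "mwen", "ap", "ap") == {
    "m'ap": [{ORTH: "m'", NORM: "mwen"}, {ORTH: "ap", NORM: "ap"}],
    "M'ap": [{ORTH: "M'", NORM: "Mwen"}, {ORTH: "ap", NORM: "ap"}],
}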
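# --- Reviewer sketch, not part of the patch: driving the noun_chunks iterator from
# syntax_iterators.py with a hand-annotated parse. The heads/deps/pos below are my own
# illustrative annotation of "Pitit gen gwo pwoblèm ak kontwòl", not the output of any
# trained ht pipeline (none exists in this diff), and the expected chunks are hand-traced.
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("ht")
doc = Doc(
    nlp.vocab,
    words=["Pitit", "gen", "gwo", "pwoblèm", "ak", "kontwòl"],
    heads=[1, 1, 3, 1, 5, 3],
    deps=["nsubj", "ROOT", "amod", "obj", "case", "nmod"],
    pos=["NOUN", "VERB", "ADJ", "NOUN", "ADP", "NOUN"],
)
print([chunk.text for chunk in doc.noun_chunks])
# expected: ["Pitit", "gwo pwoblèm ak kontwòl"]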