diff --git a/requirements.txt b/requirements.txt index e55d25a19..7fc8ab32e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,7 +17,6 @@ requests>=2.13.0,<3.0.0 tqdm>=4.38.0,<5.0.0 pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0 jinja2 -langcodes>=3.2.0,<4.0.0 # Official Python utilities setuptools packaging>=20.0 diff --git a/setup.cfg b/setup.cfg index f00b5408e..f4d50d424 100644 --- a/setup.cfg +++ b/setup.cfg @@ -65,7 +65,6 @@ install_requires = # Official Python utilities setuptools packaging>=20.0 - langcodes>=3.2.0,<4.0.0 [options.entry_points] console_scripts = diff --git a/spacy/lang/ht/__init__.py b/spacy/lang/ht/__init__.py new file mode 100644 index 000000000..e5c1c2770 --- /dev/null +++ b/spacy/lang/ht/__init__.py @@ -0,0 +1,52 @@ +from typing import Callable, Optional + +from thinc.api import Model + +from ...language import BaseDefaults, Language +from .lemmatizer import HaitianCreoleLemmatizer +from .lex_attrs import LEX_ATTRS +from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES, TOKENIZER_SUFFIXES +from .stop_words import STOP_WORDS +from .syntax_iterators import SYNTAX_ITERATORS +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS +from .tag_map import TAG_MAP + + +class HaitianCreoleDefaults(BaseDefaults): + tokenizer_exceptions = TOKENIZER_EXCEPTIONS + prefixes = TOKENIZER_PREFIXES + infixes = TOKENIZER_INFIXES + suffixes = TOKENIZER_SUFFIXES + lex_attr_getters = LEX_ATTRS + syntax_iterators = SYNTAX_ITERATORS + stop_words = STOP_WORDS + tag_map = TAG_MAP + +class HaitianCreole(Language): + lang = "ht" + Defaults = HaitianCreoleDefaults + +@HaitianCreole.factory( + "lemmatizer", + assigns=["token.lemma"], + default_config={ + "model": None, + "mode": "rule", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, + default_score_weights={"lemma_acc": 1.0}, +) +def make_lemmatizer( + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], +): + return HaitianCreoleLemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) + +__all__ = ["HaitianCreole"] diff --git a/spacy/lang/ht/examples.py b/spacy/lang/ht/examples.py new file mode 100644 index 000000000..456d34a5f --- /dev/null +++ b/spacy/lang/ht/examples.py @@ -0,0 +1,18 @@ +""" +Example sentences to test spaCy and its language models. + +>>> from spacy.lang.ht.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "Apple ap panse achte yon demaraj nan Wayòm Ini pou $1 milya dola", + "Machin otonòm fè responsablite asirans lan ale sou men fabrikan yo", + "San Francisco ap konsidere entèdi robo ki livre sou twotwa yo", + "Lond se yon gwo vil nan Wayòm Ini", + "Kote ou ye?", + "Kilès ki prezidan Lafrans?", + "Ki kapital Etazini?", + "Kile Barack Obama te fèt?", +] diff --git a/spacy/lang/ht/lemmatizer.py b/spacy/lang/ht/lemmatizer.py new file mode 100644 index 000000000..9ac096f6d --- /dev/null +++ b/spacy/lang/ht/lemmatizer.py @@ -0,0 +1,51 @@ +from typing import List, Tuple + +from ...pipeline import Lemmatizer +from ...tokens import Token +from ...lookups import Lookups + + +class HaitianCreoleLemmatizer(Lemmatizer): + """ + Minimal Haitian Creole lemmatizer. + Returns a word's base form based on rules and lookup, + or defaults to the original form. 
+ """ + + def is_base_form(self, token: Token) -> bool: + morph = token.morph.to_dict() + upos = token.pos_.lower() + + # Consider unmarked forms to be base + if upos in {"noun", "verb", "adj", "adv"}: + if not morph: + return True + if upos == "noun" and morph.get("Number") == "Sing": + return True + if upos == "verb" and morph.get("VerbForm") == "Inf": + return True + if upos == "adj" and morph.get("Degree") == "Pos": + return True + return False + + def rule_lemmatize(self, token: Token) -> List[str]: + string = token.text.lower() + pos = token.pos_.lower() + cache_key = (token.orth, token.pos) + if cache_key in self.cache: + return self.cache[cache_key] + + forms = [] + + # fallback rule: just return lowercased form + forms.append(string) + + self.cache[cache_key] = forms + return forms + + @classmethod + def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]: + if mode == "rule": + required = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"] + return (required, []) + return super().get_lookups_config(mode) diff --git a/spacy/lang/ht/lex_attrs.py b/spacy/lang/ht/lex_attrs.py new file mode 100644 index 000000000..8a3ec1ff9 --- /dev/null +++ b/spacy/lang/ht/lex_attrs.py @@ -0,0 +1,78 @@ +from ...attrs import LIKE_NUM, NORM + +# Cardinal numbers in Creole +_num_words = set( + """ +zewo youn en de twa kat senk sis sèt uit nèf dis +onz douz trèz katoz kenz sèz disèt dizwit diznèf +vent trant karant sinkant swasant swasann-dis +san mil milyon milya +""".split() +) + +# Ordinal numbers in Creole (some are French-influenced, some simplified) +_ordinal_words = set( + """ +premye dezyèm twazyèm katryèm senkyèm sizyèm sètvyèm uitvyèm nèvyèm dizyèm +onzèm douzyèm trèzyèm katozyèm kenzèm sèzyèm disetyèm dizwityèm diznèvyèm +ventyèm trantyèm karantyèm sinkantyèm swasantyèm +swasann-disyèm santyèm milyèm milyonnyèm milyadyèm +""".split() +) + +NORM_MAP = { + "'m": "mwen", + "'w": "ou", + "'l": "li", + "'n": "nou", + "'y": "yo", + "’m": "mwen", + "’w": "ou", + "’l": "li", + "’n": "nou", + "’y": "yo", + "m": "mwen", + "n": "nou", + "l": "li", + "y": "yo", + "w": "ou", + "t": "te", + "k": "ki", + "p": "pa", + "M": "Mwen", + "N": "Nou", + "L": "Li", + "Y": "Yo", + "W": "Ou", + "T": "Te", + "K": "Ki", + "P": "Pa", +} + +def like_num(text): + text = text.strip().lower() + if text.startswith(("+", "-", "±", "~")): + text = text[1:] + text = text.replace(",", "").replace(".", "") + if text.isdigit(): + return True + if text.count("/") == 1: + num, denom = text.split("/") + if num.isdigit() and denom.isdigit(): + return True + if text in _num_words: + return True + if text in _ordinal_words: + return True + # Handle things like "3yèm", "10yèm", "25yèm", etc. 
+ if text.endswith("yèm") and text[:-3].isdigit(): + return True + return False + +def norm_custom(text): + return NORM_MAP.get(text, text.lower()) + +LEX_ATTRS = { + LIKE_NUM: like_num, + NORM: norm_custom, +} diff --git a/spacy/lang/ht/punctuation.py b/spacy/lang/ht/punctuation.py new file mode 100644 index 000000000..61d88d6e1 --- /dev/null +++ b/spacy/lang/ht/punctuation.py @@ -0,0 +1,43 @@ +from ..char_classes import ( + ALPHA, + ALPHA_LOWER, + ALPHA_UPPER, + CONCAT_QUOTES, + HYPHENS, + LIST_PUNCT, + LIST_QUOTES, + LIST_ELLIPSES, + LIST_ICONS, + merge_chars, +) + +ELISION = "'’".replace(" ", "") + +_prefixes_elision = "m n l y t k w" +_prefixes_elision += " " + _prefixes_elision.upper() + +TOKENIZER_PREFIXES = LIST_PUNCT + LIST_QUOTES + [ + r"(?:({pe})[{el}])(?=[{a}])".format( + a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision) + ) +] + +TOKENIZER_SUFFIXES = LIST_PUNCT + LIST_QUOTES + LIST_ELLIPSES + [ + r"(?<=[0-9])%", # numbers like 10% + r"(?<=[0-9])(?:{h})".format(h=HYPHENS), # hyphens after numbers + r"(?<=[{a}])['’]".format(a=ALPHA), # apostrophes after letters + r"(?<=[{a}])['’][mwlnytk](?=\s|$)".format(a=ALPHA), # contractions + r"(?<=[{a}0-9])\)", # right parenthesis after letter/number + r"(?<=[{a}])\.(?=\s|$)".format(a=ALPHA), # period after letter if space or end of string + r"(?<=\))[\.\?!]", # punctuation immediately after right parenthesis +] + +TOKENIZER_INFIXES = LIST_ELLIPSES + LIST_ICONS + [ + r"(?<=[0-9])[+\-\*^](?=[0-9-])", + r"(?<=[{al}{q}])\.(?=[{au}{q}])".format( + al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES + ), + r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS), + r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION), +] diff --git a/spacy/lang/ht/stop_words.py b/spacy/lang/ht/stop_words.py new file mode 100644 index 000000000..6243887a4 --- /dev/null +++ b/spacy/lang/ht/stop_words.py @@ -0,0 +1,50 @@ +STOP_WORDS = set( + """ +a ak an ankò ant apre ap atò avan avanlè +byen bò byenke + +chak + +de depi deja deja + +e en epi èske + +fò fòk + +gen genyen + +ki kisa kilès kote koukou konsa konbyen konn konnen kounye kouman + +la l laa le lè li lye lò + +m m' mwen + +nan nap nou n' + +ou oumenm + +pa paske pami pandan pito pou pral preske pwiske + +se selman si sou sòt + +ta tap tankou te toujou tou tan tout toutotan twòp tèl + +w w' wi wè + +y y' yo yon yonn + +non o oh eh + +sa san si swa si + +men mèsi oswa osinon + +""" +.split() +) + +# Add common contractions, with and without apostrophe variants +contractions = ["m'", "n'", "w'", "y'", "l'", "t'", "k'"] +for apostrophe in ["'", "’", "‘"]: + for word in contractions: + STOP_WORDS.add(word.replace("'", apostrophe)) diff --git a/spacy/lang/ht/syntax_iterators.py b/spacy/lang/ht/syntax_iterators.py new file mode 100644 index 000000000..44ff17f74 --- /dev/null +++ b/spacy/lang/ht/syntax_iterators.py @@ -0,0 +1,74 @@ +from typing import Iterator, Tuple, Union + +from ...errors import Errors +from ...symbols import NOUN, PRON, PROPN +from ...tokens import Doc, Span + + +def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: + """ + Detect base noun phrases from a dependency parse for Haitian Creole. + Works on both Doc and Span objects. 
+ """ + + # Core nominal dependencies common in Haitian Creole + labels = [ + "nsubj", + "obj", + "obl", + "nmod", + "appos", + "ROOT", + ] + + # Modifiers to optionally include in chunk (to the right) + post_modifiers = ["compound", "flat", "flat:name", "fixed"] + + doc = doclike.doc + if not doc.has_annotation("DEP"): + raise ValueError(Errors.E029) + + np_deps = {doc.vocab.strings.add(label) for label in labels} + np_mods = {doc.vocab.strings.add(mod) for mod in post_modifiers} + conj_label = doc.vocab.strings.add("conj") + np_label = doc.vocab.strings.add("NP") + adp_pos = doc.vocab.strings.add("ADP") + cc_pos = doc.vocab.strings.add("CCONJ") + + prev_end = -1 + for i, word in enumerate(doclike): + if word.pos not in (NOUN, PROPN, PRON): + continue + if word.left_edge.i <= prev_end: + continue + + if word.dep in np_deps: + right_end = word + # expand to include known modifiers to the right + for child in word.rights: + if child.dep in np_mods: + right_end = child.right_edge + elif child.pos == NOUN: + right_end = child.right_edge + + left_index = word.left_edge.i + # Skip prepositions at the start + if word.left_edge.pos == adp_pos: + left_index += 1 + + prev_end = right_end.i + yield left_index, right_end.i + 1, np_label + + elif word.dep == conj_label: + head = word.head + while head.dep == conj_label and head.head.i < head.i: + head = head.head + if head.dep in np_deps: + left_index = word.left_edge.i + if word.left_edge.pos == cc_pos: + left_index += 1 + prev_end = word.i + yield left_index, word.i + 1, np_label + + +SYNTAX_ITERATORS = {"noun_chunks": noun_chunks} diff --git a/spacy/lang/ht/tag_map.py b/spacy/lang/ht/tag_map.py new file mode 100644 index 000000000..8c9cdd6d4 --- /dev/null +++ b/spacy/lang/ht/tag_map.py @@ -0,0 +1,21 @@ +from spacy.symbols import NOUN, VERB, AUX, ADJ, ADV, PRON, DET, ADP, SCONJ, CCONJ, PART, INTJ, NUM, PROPN, PUNCT, SYM, X + +TAG_MAP = { + "NOUN": {"pos": NOUN}, + "VERB": {"pos": VERB}, + "AUX": {"pos": AUX}, + "ADJ": {"pos": ADJ}, + "ADV": {"pos": ADV}, + "PRON": {"pos": PRON}, + "DET": {"pos": DET}, + "ADP": {"pos": ADP}, + "SCONJ": {"pos": SCONJ}, + "CCONJ": {"pos": CCONJ}, + "PART": {"pos": PART}, + "INTJ": {"pos": INTJ}, + "NUM": {"pos": NUM}, + "PROPN": {"pos": PROPN}, + "PUNCT": {"pos": PUNCT}, + "SYM": {"pos": SYM}, + "X": {"pos": X}, +} diff --git a/spacy/lang/ht/tokenizer_exceptions.py b/spacy/lang/ht/tokenizer_exceptions.py new file mode 100644 index 000000000..b44ad7a6f --- /dev/null +++ b/spacy/lang/ht/tokenizer_exceptions.py @@ -0,0 +1,121 @@ +from spacy.symbols import ORTH, NORM + +def make_variants(base, first_norm, second_orth, second_norm): + return { + base: [ + {ORTH: base.split("'")[0] + "'", NORM: first_norm}, + {ORTH: second_orth, NORM: second_norm}, + ], + base.capitalize(): [ + {ORTH: base.split("'")[0].capitalize() + "'", NORM: first_norm.capitalize()}, + {ORTH: second_orth, NORM: second_norm}, + ] + } + +TOKENIZER_EXCEPTIONS = { + "Dr.": [{ORTH: "Dr."}] +} + +# Apostrophe forms +TOKENIZER_EXCEPTIONS.update(make_variants("m'ap", "mwen", "ap", "ap")) +TOKENIZER_EXCEPTIONS.update(make_variants("n'ap", "nou", "ap", "ap")) +TOKENIZER_EXCEPTIONS.update(make_variants("l'ap", "li", "ap", "ap")) +TOKENIZER_EXCEPTIONS.update(make_variants("y'ap", "yo", "ap", "ap")) +TOKENIZER_EXCEPTIONS.update(make_variants("m'te", "mwen", "te", "te")) +TOKENIZER_EXCEPTIONS.update(make_variants("m'pral", "mwen", "pral", "pral")) +TOKENIZER_EXCEPTIONS.update(make_variants("w'ap", "ou", "ap", "ap")) +TOKENIZER_EXCEPTIONS.update(make_variants("k'ap", 
"ki", "ap", "ap")) +TOKENIZER_EXCEPTIONS.update(make_variants("p'ap", "pa", "ap", "ap")) +TOKENIZER_EXCEPTIONS.update(make_variants("t'ap", "te", "ap", "ap")) + +# Non-apostrophe contractions (with capitalized variants) +TOKENIZER_EXCEPTIONS.update({ + "map": [ + {ORTH: "m", NORM: "mwen"}, + {ORTH: "ap", NORM: "ap"}, + ], + "Map": [ + {ORTH: "M", NORM: "Mwen"}, + {ORTH: "ap", NORM: "ap"}, + ], + "lem": [ + {ORTH: "le", NORM: "le"}, + {ORTH: "m", NORM: "mwen"}, + ], + "Lem": [ + {ORTH: "Le", NORM: "Le"}, + {ORTH: "m", NORM: "mwen"}, + ], + "lew": [ + {ORTH: "le", NORM: "le"}, + {ORTH: "w", NORM: "ou"}, + ], + "Lew": [ + {ORTH: "Le", NORM: "Le"}, + {ORTH: "w", NORM: "ou"}, + ], + "nap": [ + {ORTH: "n", NORM: "nou"}, + {ORTH: "ap", NORM: "ap"}, + ], + "Nap": [ + {ORTH: "N", NORM: "Nou"}, + {ORTH: "ap", NORM: "ap"}, + ], + "lap": [ + {ORTH: "l", NORM: "li"}, + {ORTH: "ap", NORM: "ap"}, + ], + "Lap": [ + {ORTH: "L", NORM: "Li"}, + {ORTH: "ap", NORM: "ap"}, + ], + "yap": [ + {ORTH: "y", NORM: "yo"}, + {ORTH: "ap", NORM: "ap"}, + ], + "Yap": [ + {ORTH: "Y", NORM: "Yo"}, + {ORTH: "ap", NORM: "ap"}, + ], + "mte": [ + {ORTH: "m", NORM: "mwen"}, + {ORTH: "te", NORM: "te"}, + ], + "Mte": [ + {ORTH: "M", NORM: "Mwen"}, + {ORTH: "te", NORM: "te"}, + ], + "mpral": [ + {ORTH: "m", NORM: "mwen"}, + {ORTH: "pral", NORM: "pral"}, + ], + "Mpral": [ + {ORTH: "M", NORM: "Mwen"}, + {ORTH: "pral", NORM: "pral"}, + ], + "wap": [ + {ORTH: "w", NORM: "ou"}, + {ORTH: "ap", NORM: "ap"}, + ], + "Wap": [ + {ORTH: "W", NORM: "Ou"}, + {ORTH: "ap", NORM: "ap"}, + ], + "kap": [ + {ORTH: "k", NORM: "ki"}, + {ORTH: "ap", NORM: "ap"}, + ], + "Kap": [ + {ORTH: "K", NORM: "Ki"}, + {ORTH: "ap", NORM: "ap"}, + ], + "tap": [ + {ORTH: "t", NORM: "te"}, + {ORTH: "ap", NORM: "ap"}, + ], + "Tap": [ + {ORTH: "T", NORM: "Te"}, + {ORTH: "ap", NORM: "ap"}, + ], +}) diff --git a/spacy/language.py b/spacy/language.py index 9cdd724f5..5b9eb8bd2 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -141,7 +141,7 @@ class Language: Defaults (class): Settings, data and factory methods for creating the `nlp` object and processing pipeline. - lang (str): IETF language code, such as 'en'. + lang (str): Two-letter ISO 639-1 or three-letter ISO 639-3 language codes, such as 'en' and 'eng'. DOCS: https://spacy.io/api/language """ diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index e30300a33..ae5255c28 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -212,6 +212,16 @@ def hr_tokenizer(): return get_lang_class("hr")().tokenizer +@pytest.fixture(scope="session") +def ht_tokenizer(): + return get_lang_class("ht")().tokenizer + + +@pytest.fixture(scope="session") +def ht_vocab(): + return get_lang_class("ht")().vocab + + @pytest.fixture def hu_tokenizer(): return get_lang_class("hu")().tokenizer diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index 98a74bc21..7167b68ac 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -49,7 +49,7 @@ def doc_not_parsed(en_tokenizer): def test_issue1537(): """Test that Span.as_doc() doesn't segfault.""" string = "The sky is blue . The man is pink . The dog is purple ." 
- doc = Doc(Vocab(), words=string.split()) + doc = Doc(Vocab(), words=list(string.split())) doc[0].sent_start = True for word in doc[1:]: if word.nbor(-1).text == ".": @@ -225,6 +225,21 @@ def test_spans_span_sent(doc, doc_not_parsed): assert doc_not_parsed[10:14].sent == doc_not_parsed[5:] +def test_issue13769(): + # Test issue 13769: Incorrect output of span.sents when final token is a sentence outside of the span. + doc = Doc( + Vocab(), + words=list("This is a sentence . This is another sentence . Third".split()), + ) + doc[0].is_sent_start = True + doc[5].is_sent_start = True + doc[10].is_sent_start = True + doc.ents = [("ENTITY", 7, 9)] # "another sentence" phrase in the second sentence + entity = doc.ents[0] + ent_sents = list(entity.sents) + assert len(ent_sents) == 1 + + @pytest.mark.parametrize( "start,end,expected_sentence", [ diff --git a/spacy/tests/lang/ht/__init__.py b/spacy/tests/lang/ht/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/ht/test_exceptions.py b/spacy/tests/lang/ht/test_exceptions.py new file mode 100644 index 000000000..685b72c07 --- /dev/null +++ b/spacy/tests/lang/ht/test_exceptions.py @@ -0,0 +1,32 @@ +import pytest + + +def test_ht_tokenizer_handles_basic_contraction(ht_tokenizer): + text = "m'ap ri" + tokens = ht_tokenizer(text) + assert len(tokens) == 3 + assert tokens[0].text == "m'" + assert tokens[1].text == "ap" + assert tokens[2].text == "ri" + + text = "mwen di'w non!" + tokens = ht_tokenizer(text) + assert len(tokens) == 5 + assert tokens[0].text == "mwen" + assert tokens[1].text == "di" + assert tokens[2].text == "'w" + assert tokens[3].text == "non" + assert tokens[4].text == "!" + + +@pytest.mark.parametrize("text", ["Dr."]) +def test_ht_tokenizer_handles_basic_abbreviation(ht_tokenizer, text): + tokens = ht_tokenizer(text) + assert len(tokens) == 1 + assert tokens[0].text == text + + +def test_ht_tokenizer_full_sentence(ht_tokenizer): + text = "Si'm ka vini, m'ap pale ak li." 
+ tokens = [t.text for t in ht_tokenizer(text)] + assert tokens == ["Si", "'m", "ka", "vini", ",", "m'", "ap", "pale", "ak", "li", "."] diff --git a/spacy/tests/lang/ht/test_noun_chunks.py b/spacy/tests/lang/ht/test_noun_chunks.py new file mode 100644 index 000000000..76c5a1df3 --- /dev/null +++ b/spacy/tests/lang/ht/test_noun_chunks.py @@ -0,0 +1,44 @@ +import pytest +from spacy.tokens import Doc + + +@pytest.fixture +def doc(ht_vocab): + words = ["Pitit", "gen", "gwo", "pwoblèm", "ak", "kontwòl"] + heads = [1, 1, 5, 5, 3, 3] + deps = ["nsubj", "ROOT", "amod", "obj", "case", "nmod"] + pos = ["NOUN", "VERB", "ADJ", "NOUN", "ADP", "NOUN"] + return Doc(ht_vocab, words=words, heads=heads, deps=deps, pos=pos) + + +def test_noun_chunks_is_parsed(ht_tokenizer): + """Test that noun_chunks raises Value Error for 'ht' language if Doc is not parsed.""" + doc = ht_tokenizer("Sa a se yon fraz") + with pytest.raises(ValueError): + list(doc.noun_chunks) + + +def test_ht_noun_chunks_not_nested(doc, ht_vocab): + """Test that each token only appears in one noun chunk at most""" + word_occurred = {} + chunks = list(doc.noun_chunks) + assert len(chunks) > 1 + for chunk in chunks: + for word in chunk: + word_occurred.setdefault(word.text, 0) + word_occurred[word.text] += 1 + assert len(word_occurred) > 0 + for word, freq in word_occurred.items(): + assert freq == 1, (word, [chunk.text for chunk in doc.noun_chunks]) + + +def test_noun_chunks_span(doc, ht_tokenizer): + """Test that the span.noun_chunks property works correctly""" + doc_chunks = list(doc.noun_chunks) + span = doc[0:3] + span_chunks = list(span.noun_chunks) + assert 0 < len(span_chunks) < len(doc_chunks) + for chunk in span_chunks: + assert chunk in doc_chunks + assert chunk.start >= 0 + assert chunk.end <= 3 diff --git a/spacy/tests/lang/ht/test_prefix_suffix_infix.py b/spacy/tests/lang/ht/test_prefix_suffix_infix.py new file mode 100644 index 000000000..7dabec17a --- /dev/null +++ b/spacy/tests/lang/ht/test_prefix_suffix_infix.py @@ -0,0 +1,130 @@ +import pytest + + +@pytest.mark.parametrize("text", ["(ka)"]) +def test_ht_tokenizer_splits_no_special(ht_tokenizer, text): + tokens = ht_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize("text", ["m'ap"]) +def test_ht_tokenizer_splits_no_punct(ht_tokenizer, text): + tokens = ht_tokenizer(text) + assert len(tokens) == 2 + + +@pytest.mark.parametrize("text", ["(m'ap"]) +def test_ht_tokenizer_splits_prefix_punct(ht_tokenizer, text): + tokens = ht_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize("text", ["m'ap)"]) +def test_ht_tokenizer_splits_suffix_punct(ht_tokenizer, text): + tokens = ht_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize("text", ["(m'ap)"]) +def test_ht_tokenizer_splits_even_wrap(ht_tokenizer, text): + tokens = ht_tokenizer(text) + assert len(tokens) == 4 + + +@pytest.mark.parametrize("text", ["(m'ap?)"]) +def test_ht_tokenizer_splits_uneven_wrap(ht_tokenizer, text): + tokens = ht_tokenizer(text) + assert len(tokens) == 5 + + +@pytest.mark.parametrize("text,length", [("Ozetazini.", 2), ("Frans.", 2), ("(Ozetazini.", 3)]) +def test_ht_tokenizer_splits_prefix_interact(ht_tokenizer, text, length): + tokens = ht_tokenizer(text) + assert len(tokens) == length + + +@pytest.mark.parametrize("text", ["Ozetazini.)"]) +def test_ht_tokenizer_splits_suffix_interact(ht_tokenizer, text): + tokens = ht_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize("text", ["(Ozetazini.)"]) +def 
test_ht_tokenizer_splits_even_wrap_interact(ht_tokenizer, text): + tokens = ht_tokenizer(text) + assert len(tokens) == 4 + + +@pytest.mark.parametrize("text", ["(Ozetazini?)"]) +def test_ht_tokenizer_splits_uneven_wrap_interact(ht_tokenizer, text): + tokens = ht_tokenizer(text) + assert len(tokens) == 4 + + +@pytest.mark.parametrize("text", ["pi-bon"]) +def test_ht_tokenizer_splits_hyphens(ht_tokenizer, text): + tokens = ht_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize("text", ["0.1-13.5", "0.0-0.1", "103.27-300"]) +def test_ht_tokenizer_splits_numeric_range(ht_tokenizer, text): + tokens = ht_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize("text", ["pi.Bon", "Bon.Jour"]) +def test_ht_tokenizer_splits_period_infix(ht_tokenizer, text): + tokens = ht_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize("text", ["Bonjou,moun", "youn,de"]) +def test_ht_tokenizer_splits_comma_infix(ht_tokenizer, text): + tokens = ht_tokenizer(text) + assert len(tokens) == 3 + assert tokens[0].text == text.split(",")[0] + assert tokens[1].text == "," + assert tokens[2].text == text.split(",")[1] + + +@pytest.mark.parametrize("text", ["pi...Bon", "pi...bon"]) +def test_ht_tokenizer_splits_ellipsis_infix(ht_tokenizer, text): + tokens = ht_tokenizer(text) + assert len(tokens) == 3 + + +def test_ht_tokenizer_splits_double_hyphen_infix(ht_tokenizer): + tokens = ht_tokenizer("Pa vrè--men ou konnen--mwen renmen w.") + assert tokens[0].text == "Pa" + assert tokens[1].text == "vrè" + assert tokens[2].text == "--" + assert tokens[3].text == "men" + assert tokens[4].text == "ou" + assert tokens[5].text == "konnen" + assert tokens[6].text == "--" + assert tokens[7].text == "mwen" + assert tokens[8].text == "renmen" + assert tokens[9].text == "w" + assert tokens[10].text == "." + + +def test_ht_tokenizer_splits_period_abbr(ht_tokenizer): + text = "Jodi a se Madi.Mr." + tokens = ht_tokenizer(text) + assert len(tokens) == 7 + assert tokens[0].text == "Jodi" + assert tokens[1].text == "a" + assert tokens[2].text == "se" + assert tokens[3].text == "Madi" + assert tokens[4].text == "." + assert tokens[5].text == "Mr" + assert tokens[6].text == "." + + +def test_ht_tokenizer_splits_paren_period(ht_tokenizer): + tokens = ht_tokenizer("M ap teste sa (pou kounye a).") + words = [t.text for t in tokens] + assert "a" in words + assert ")" in words + assert "." in words diff --git a/spacy/tests/lang/ht/test_text.py b/spacy/tests/lang/ht/test_text.py new file mode 100644 index 000000000..f396e352a --- /dev/null +++ b/spacy/tests/lang/ht/test_text.py @@ -0,0 +1,79 @@ +import pytest + +from spacy.lang.ht.lex_attrs import like_num, norm_custom + + +def test_ht_tokenizer_handles_long_text(ht_tokenizer): + text = """Onè ap fèt pou ansyen lidè Pati Travayè Britanik + +Moun atravè lemond ap voye onè pou ansyen lidè +Pati Travayè a, John Smith, ki mouri pi bonè jodi a apre li te fè yon gwo kriz kadyak a laj 55 an. + +Nan Washington, Depatman Deta Etazini pibliye yon deklarasyon ki eksprime "regre lanmò twò bonè" avoka ak palmantè eskoze a. + +"Misye Smith, pandan tout karyè li ki te make ak distenksyon""" + tokens = ht_tokenizer(text) + assert len(tokens) == 84 + + + +@pytest.mark.parametrize( + "text,length", + [ + ("Map manje gato a pandan map gade televizyon lem lakay mwen.", 15), + ("M'ap vini, eske wap la avek lajan'm? 
Si oui, di'l non pou fre'w.", 22), + ("M ap teste sa (pou kounye a).", 10), + ], +) +def test_ht_tokenizer_handles_cnts(ht_tokenizer, text, length): + tokens = ht_tokenizer(text) + assert len(tokens) == length + + +@pytest.mark.parametrize( + "text,match", + [ + ("10", True), + ("1", True), + ("10,000", True), + ("10,00", True), + ("999.0", True), + ("en", True), + ("de", True), + ("milya", True), + ("dog", False), + (",", False), + ("1/2", True), + ], +) +def test_lex_attrs_like_number(ht_tokenizer, text, match): + tokens = ht_tokenizer(text) + assert len(tokens) == 1 + assert tokens[0].like_num == match + + +@pytest.mark.parametrize( + "word", ["ventyèm", "Milyonnyèm", "3yèm", "Santyèm", "25yèm", "52yèm"] +) +def test_ht_lex_attrs_like_number_for_ordinal(word): + assert like_num(word) + + +@pytest.mark.parametrize("word", ["onz"]) +def test_ht_lex_attrs_capitals(word): + assert like_num(word) + assert like_num(word.upper()) + + +@pytest.mark.parametrize( + "word, expected", [ + ("'m", "mwen"), + ("'n", "nou"), + ("'l", "li"), + ("'y", "yo"), + ("'w", "ou"), + ] +) +def test_ht_lex_attrs_norm_custom(word, expected): + assert norm_custom(word) == expected + diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index f946528ae..9818d5d7c 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -656,17 +656,12 @@ def test_spacy_blank(): @pytest.mark.parametrize( "lang,target", [ - ("en", "en"), ("fra", "fr"), ("fre", "fr"), ("iw", "he"), ("mo", "ro"), + ("scc", "sr"), ("mul", "xx"), - ("no", "nb"), - ("pt-BR", "pt"), - ("xx", "xx"), - ("zh-Hans", "zh"), - ("zh-Hant", None), ("zxx", None), ], ) @@ -686,11 +681,9 @@ def test_language_matching(lang, target): ("fre", "fr"), ("iw", "he"), ("mo", "ro"), + ("scc", "sr"), ("mul", "xx"), - ("no", "nb"), - ("pt-BR", "pt"), ("xx", "xx"), - ("zh-Hans", "zh"), ], ) def test_blank_languages(lang, target): diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 64b8d7c6c..a7faf0d62 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -479,10 +479,11 @@ cdef class Span: break elif i == self.doc.length - 1: yield Span(self.doc, start, self.doc.length) - - # Ensure that trailing parts of the Span instance are included in last element of .sents. - if start == self.doc.length - 1: - yield Span(self.doc, start, self.doc.length) + else: + # Ensure that trailing parts of the Span instance are included in last element of .sents. + # We only want to do this if we didn't break above + if start == self.doc.length - 1: + yield Span(self.doc, start, self.doc.length) @property def ents(self): diff --git a/spacy/util.py b/spacy/util.py index f1e68696b..527e6eb3a 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -5,7 +5,6 @@ import inspect import itertools import logging import os -import pkgutil import re import shlex import shutil @@ -40,7 +39,6 @@ from typing import ( ) import catalogue -import langcodes import numpy import srsly import thinc @@ -89,6 +87,83 @@ LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "grc", "id", "lb", "mk", "pt" # Default order of sections in the config file. Not all sections needs to exist, # and additional sections are added at the end, in alphabetical order. 
 CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining", "initialize"]
+
+LANG_ALIASES = {
+    "af": ["afr"],
+    "am": ["amh"],
+    "ar": ["ara"],
+    "az": ["aze"],
+    "bg": ["bul"],
+    "bn": ["ben"],
+    "bo": ["bod", "tib"],
+    "ca": ["cat"],
+    "cs": ["ces", "cze"],
+    "da": ["dan"],
+    "de": ["deu", "ger"],
+    "el": ["ell", "gre"],
+    "en": ["eng"],
+    "es": ["spa"],
+    "et": ["est"],
+    "eu": ["eus", "baq"],
+    "fa": ["fas", "per"],
+    "fi": ["fin"],
+    "fo": ["fao"],
+    "fr": ["fra", "fre"],
+    "ga": ["gle"],
+    "gd": ["gla"],
+    "gu": ["guj"],
+    "he": ["heb", "iw"],  # "iw" is the obsolete ISO 639-1 code for Hebrew
+    "hi": ["hin"],
+    "hr": ["hrv", "scr"],  # "scr" is the deprecated ISO 639-2/B code for Croatian
+    "hu": ["hun"],
+    "hy": ["hye"],
+    "id": ["ind", "in"],  # "in" is the obsolete ISO 639-1 code for Indonesian
+    "is": ["isl", "ice"],
+    "it": ["ita"],
+    "ja": ["jpn"],
+    "kn": ["kan"],
+    "ko": ["kor"],
+    "ky": ["kir"],
+    "la": ["lat"],
+    "lb": ["ltz"],
+    "lg": ["lug"],
+    "lt": ["lit"],
+    "lv": ["lav"],
+    "mk": ["mkd", "mac"],
+    "ml": ["mal"],
+    "mr": ["mar"],
+    "ms": ["msa", "may"],
+    "nb": ["nob"],
+    "ne": ["nep"],
+    "nl": ["nld", "dut"],
+    "nn": ["nno"],
+    "pl": ["pol"],
+    "pt": ["por"],
+    "ro": ["ron", "rum", "mo", "mol"],  # "mo" and "mol" are deprecated codes for Moldavian
+    "ru": ["rus"],
+    "sa": ["san"],
+    "si": ["sin"],
+    "sk": ["slk", "slo"],
+    "sl": ["slv"],
+    "sq": ["sqi", "alb"],
+    "sr": ["srp", "scc"],  # "scc" is the deprecated ISO 639-2/B code for Serbian
+    "sv": ["swe"],
+    "ta": ["tam"],
+    "te": ["tel"],
+    "th": ["tha"],
+    "ti": ["tir"],
+    "tl": ["tgl"],
+    "tn": ["tsn"],
+    "tr": ["tur"],
+    "tt": ["tat"],
+    "uk": ["ukr"],
+    "ur": ["urd"],
+    "vi": ["vie"],
+    "yo": ["yor"],
+    "zh": ["zho", "chi"],
+
+    "xx": ["mul"],
+}
 # fmt: on

 logger = logging.getLogger("spacy")
@@ -305,63 +380,39 @@ def lang_class_is_loaded(lang: str) -> bool:

 def find_matching_language(lang: str) -> Optional[str]:
     """
-    Given an IETF language code, find a supported spaCy language that is a
-    close match for it (according to Unicode CLDR language-matching rules).
-    This allows for language aliases, ISO 639-2 codes, more detailed language
-    tags, and close matches.
+    Given a two-letter ISO 639-1 or three-letter ISO 639-3 language code,
+    find a supported spaCy language.

     Returns the language code if a matching language is available, or None
     if there is no matching language.
- >>> find_matching_language('en') - 'en' - >>> find_matching_language('pt-BR') # Brazilian Portuguese - 'pt' - >>> find_matching_language('fra') # an ISO 639-2 code for French + >>> find_matching_language('fra') # ISO 639-3 code for French 'fr' - >>> find_matching_language('iw') # obsolete alias for Hebrew + >>> find_matching_language('fre') # ISO 639-2/B code for French + 'fr' + >>> find_matching_language('iw') # Obsolete ISO 639-1 code for Hebrew 'he' - >>> find_matching_language('no') # Norwegian - 'nb' - >>> find_matching_language('mo') # old code for ro-MD + >>> find_matching_language('mo') # Deprecated code for Moldavian 'ro' - >>> find_matching_language('zh-Hans') # Simplified Chinese - 'zh' + >>> find_matching_language('scc') # Deprecated ISO 639-2/B code for Serbian + 'sr' >>> find_matching_language('zxx') None """ import spacy.lang # noqa: F401 - if lang == "xx": - return "xx" + # Check aliases + for lang_code, aliases in LANG_ALIASES.items(): + if lang in aliases: + return lang_code - # Find out which language modules we have - possible_languages = [] - for modinfo in pkgutil.iter_modules(spacy.lang.__path__): # type: ignore[attr-defined] - code = modinfo.name - if code == "xx": - # Temporarily make 'xx' into a valid language code - possible_languages.append("mul") - elif langcodes.tag_is_valid(code): - possible_languages.append(code) - - # Distances from 1-9 allow near misses like Bosnian -> Croatian and - # Norwegian -> Norwegian Bokmål. A distance of 10 would include several - # more possibilities, like variants of Chinese like 'wuu', but text that - # is labeled that way is probably trying to be distinct from 'zh' and - # shouldn't automatically match. - match = langcodes.closest_supported_match(lang, possible_languages, max_distance=9) - if match == "mul": - # Convert 'mul' back to spaCy's 'xx' - return "xx" - else: - return match + return None def get_lang_class(lang: str) -> Type["Language"]: """Import and load a Language class. - lang (str): IETF language code, such as 'en'. + lang (str): Two-letter ISO 639-1 or three-letter ISO 639-3 language code, such as 'en' and 'eng'. RETURNS (Language): Language class. """ # Check if language is registered / entry point is available @@ -372,13 +423,9 @@ def get_lang_class(lang: str) -> Type["Language"]: try: module = importlib.import_module(f".lang.{lang}", "spacy") except ImportError as err: - # Find a matching language. For example, if the language 'no' is - # requested, we can use language-matching to load `spacy.lang.nb`. - try: - match = find_matching_language(lang) - except langcodes.tag_parser.LanguageTagError: - # proceed to raising an import error - match = None + # Find a matching language. For example, if the language 'eng' is + # requested, we can use language-matching to load `spacy.lang.en`. 
+ match = find_matching_language(lang) if match: lang = match diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx index 6c47c8f1e..09a978259 100644 --- a/website/docs/api/cli.mdx +++ b/website/docs/api/cli.mdx @@ -230,7 +230,7 @@ $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--tr | Name | Description | | ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `lang` | Pipeline language [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as `en`. ~~str (positional)~~ | +| `lang` | Pipeline language. Two-letter [ISO 639-1 code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) or three-letter [ISO 639-3 code](https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes), such as `en` and `eng`. ~~str (positional)~~ | | `vectors_loc` | Location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Path (positional)~~ | | `output_dir` | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~ | | `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ | diff --git a/website/docs/api/language.mdx b/website/docs/api/language.mdx index b969ddc53..a1c6601ab 100644 --- a/website/docs/api/language.mdx +++ b/website/docs/api/language.mdx @@ -1078,7 +1078,7 @@ details. | Name | Description | | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `Defaults` | Settings, data and factory methods for creating the `nlp` object and processing pipeline. ~~Defaults~~ | -| `lang` | [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as 'en' for English. ~~str~~ | +| `lang` | Two-letter [ISO 639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) or three-letter [ISO 639-3](https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes) language codes, such as 'en' and 'eng' for English. ~~str~~ | | `default_config` | Base [config](/usage/training#config) to use for [Language.config](/api/language#config). Defaults to [`default_config.cfg`](%%GITHUB_SPACY/spacy/default_config.cfg). ~~Config~~ | ## Defaults {id="defaults"} diff --git a/website/docs/api/span.mdx b/website/docs/api/span.mdx index 225ff6e6a..d44015382 100644 --- a/website/docs/api/span.mdx +++ b/website/docs/api/span.mdx @@ -561,7 +561,7 @@ overlaps with will be returned. | `orth_` | Verbatim text content (identical to `Span.text`). Exists mostly for consistency with the other attributes. ~~str~~ | | `label` | The hash value of the span's label. ~~int~~ | | `label_` | The span's label. ~~str~~ | -| `lemma_` | The span's lemma. Equivalent to `"".join(token.text_with_ws for token in span)`. ~~str~~ | +| `lemma_` | The span's lemma. Equivalent to `"".join(token.lemma_ + token.whitespace_ for token in span).strip()`. ~~str~~ | | `kb_id` | The hash value of the knowledge base ID referred to by the span. ~~int~~ | | `kb_id_` | The knowledge base ID referred to by the span. 
~~str~~ | | `ent_id` | The hash value of the named entity the root token is an instance of. ~~int~~ | diff --git a/website/docs/api/top-level.mdx b/website/docs/api/top-level.mdx index 9cdc0c8ab..340f10f77 100644 --- a/website/docs/api/top-level.mdx +++ b/website/docs/api/top-level.mdx @@ -86,7 +86,7 @@ Create a blank pipeline of a given language class. This function is the twin of | Name | Description | | ----------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `name` | [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as 'en', of the language class to load. ~~str~~ | +| `name` | Two-letter [ISO 639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) or three-letter [ISO 639-3](https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes) language codes, such as 'en' and 'eng', of the language class to load. ~~str~~ | | _keyword-only_ | | | `vocab` | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ | | `config` 3 | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ | diff --git a/website/docs/api/vectors.mdx b/website/docs/api/vectors.mdx index 0e92eb12b..da2d7831a 100644 --- a/website/docs/api/vectors.mdx +++ b/website/docs/api/vectors.mdx @@ -51,7 +51,7 @@ modified later. | `strings` | The string store. A new string store is created if one is not provided. Defaults to `None`. ~~Optional[StringStore]~~ | | `shape` | Size of the table as `(n_entries, n_columns)`, the number of entries and number of columns. Not required if you're initializing the object with `data` and `keys`. ~~Tuple[int, int]~~ | | `data` | The vector data. ~~numpy.ndarray[ndim=2, dtype=float32]~~ | -| `keys` | A sequence of keys aligned with the data. ~~Iterable[Union[str, int]]~~ | +| `keys` | An iterable of keys aligned with the data. ~~Iterable[Union[str, int]]~~ | | `name` | A name to identify the vectors table. ~~str~~ | | `mode` 3.2 | Vectors mode: `"default"` or [`"floret"`](https://github.com/explosion/floret) (default: `"default"`). ~~str~~ | | `minn` 3.2 | The floret char ngram minn (default: `0`). ~~int~~ | diff --git a/website/docs/api/vocab.mdx b/website/docs/api/vocab.mdx index 57618397d..2466f561b 100644 --- a/website/docs/api/vocab.mdx +++ b/website/docs/api/vocab.mdx @@ -283,7 +283,7 @@ Serialize the current state to a binary string. | -------------- | ------------------------------------------------------------------------------------------- | | _keyword-only_ | | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | -| **RETURNS** | The serialized form of the `Vocab` object. ~~Vocab~~ | +| **RETURNS** | The serialized form of the `Vocab` object. ~~bytes~~ | ## Vocab.from_bytes {id="from_bytes",tag="method"}
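
A minimal smoke test for this changeset (illustrative only, not part of the diff): it assumes spaCy is installed from this branch and uses only public API (`spacy.blank`, `spacy.util.find_matching_language`). The Haitian Creole sentence is taken from the new `spacy/lang/ht/examples.py`; the expected outputs reflect the new `LANG_ALIASES` table that replaces `langcodes`.

```python
# Illustrative check for this branch; assumes spaCy is installed from this changeset.
import spacy
from spacy.util import find_matching_language

# The new Haitian Creole ("ht") language class is loadable as a blank pipeline.
nlp = spacy.blank("ht")
doc = nlp("Kilès ki prezidan Lafrans?")  # example sentence from spacy/lang/ht/examples.py
print([t.text for t in doc])

# Three-letter and legacy codes now resolve through LANG_ALIASES instead of langcodes.
print(find_matching_language("fra"))  # expected: "fr"
print(find_matching_language("scc"))  # expected: "sr"
print(spacy.blank("eng").lang)        # expected: "en"
```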