Mirror of https://github.com/explosion/spaCy.git, synced 2025-09-19 18:42:37 +03:00

Commit c88360f2f5: Merge 972f8a6354 into 41e07772dc
.github/workflows/tests.yml (vendored): 4 changed lines
@@ -54,9 +54,9 @@ jobs:
   tests:
     name: Test
-    needs: Validate
+    needs: validate
     strategy:
-      fail-fast: true
+      fail-fast: false
       matrix:
         os: [ubuntu-latest, windows-latest, macos-latest]
         python_version: ["3.9", "3.12", "3.13"]

@@ -99,7 +99,7 @@ def parse_config_overrides(
     RETURNS (Dict[str, Any]): The parsed dict, keyed by nested config setting.
     """
     env_string = os.environ.get(env_var, "") if env_var else ""
-    env_overrides = _parse_overrides(split_arg_string(env_string))
+    env_overrides = _parse_overrides(split_arg_string(env_string))  # type: ignore[operator]
     cli_overrides = _parse_overrides(args, is_cli=True)
     if cli_overrides:
         keys = [k for k in cli_overrides if k not in env_overrides]

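For orientation, the overrides handled above use dotted config keys. The following is an illustrative sketch only (the dict values are made up and it bypasses _parse_overrides); it just shows the key convention and the keys expression from the hunk.

# Illustrative values only; real overrides come from the CLI/env via _parse_overrides.
env_overrides = {"training.max_epochs": 10}
cli_overrides = {"nlp.lang": "ht", "training.max_epochs": 20}

# Same expression as in the diff: override keys supplied only on the CLI.
keys = [k for k in cli_overrides if k not in env_overrides]
print(keys)  # ['nlp.lang']
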
@@ -5,11 +5,11 @@ from thinc.api import Model
 from ...language import BaseDefaults, Language
 from .lemmatizer import HaitianCreoleLemmatizer
 from .lex_attrs import LEX_ATTRS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .tag_map import TAG_MAP
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 
 
 class HaitianCreoleDefaults(BaseDefaults):

@@ -22,10 +22,12 @@ class HaitianCreoleDefaults(BaseDefaults):
     stop_words = STOP_WORDS
     tag_map = TAG_MAP
 
+
 class HaitianCreole(Language):
     lang = "ht"
     Defaults = HaitianCreoleDefaults
 
+
 @HaitianCreole.factory(
     "lemmatizer",
     assigns=["token.lemma"],

@@ -49,4 +51,5 @@ def make_lemmatizer(
         nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
     )
 
+
 __all__ = ["HaitianCreole"]

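The HaitianCreole class registered above can be used like any other spaCy language. A minimal sketch, assuming a spaCy build that includes the ht module from this changeset; the expected token split matches the full-sentence test further down.

import spacy

nlp = spacy.blank("ht")  # instantiates HaitianCreole with HaitianCreoleDefaults
doc = nlp("Si'm ka vini, m'ap pale ak li.")
print([t.text for t in doc])
# ['Si', "'m", 'ka', 'vini', ',', "m'", 'ap', 'pale', 'ak', 'li', '.']
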
@@ -1,8 +1,8 @@
 from typing import List, Tuple
 
+from ...lookups import Lookups
 from ...pipeline import Lemmatizer
 from ...tokens import Token
-from ...lookups import Lookups
 
 
 class HaitianCreoleLemmatizer(Lemmatizer):

@@ -49,6 +49,7 @@ NORM_MAP = {
     "P": "Pa",
 }
 
+
 def like_num(text):
     text = text.strip().lower()
     if text.startswith(("+", "-", "±", "~")):

@@ -69,9 +70,11 @@ def like_num(text):
         return True
     return False
 
+
 def norm_custom(text):
     return NORM_MAP.get(text, text.lower())
 
+
 LEX_ATTRS = {
     LIKE_NUM: like_num,
     NORM: norm_custom,

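A quick illustration of the lexical attributes above. This assumes the module is importable as spacy.lang.ht.lex_attrs and that like_num follows the usual digit-string check; the "'m" case matches the norm_custom test near the end of this diff.

from spacy.lang.ht.lex_attrs import like_num, norm_custom

print(like_num("25"))        # expected True for plain digit strings
print(norm_custom("P"))      # 'Pa', via the NORM_MAP entry shown above
print(norm_custom("'m"))     # 'mwen', per the parametrized test below
print(norm_custom("Bonjou")) # falls back to lowercasing: 'bonjou'
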
@@ -4,10 +4,10 @@ from ..char_classes import (
     ALPHA_UPPER,
     CONCAT_QUOTES,
     HYPHENS,
-    LIST_PUNCT,
-    LIST_QUOTES,
     LIST_ELLIPSES,
     LIST_ICONS,
+    LIST_PUNCT,
+    LIST_QUOTES,
     merge_chars,
 )
 

@@ -16,28 +16,43 @@ ELISION = "'’".replace(" ", "")
 _prefixes_elision = "m n l y t k w"
 _prefixes_elision += " " + _prefixes_elision.upper()
 
-TOKENIZER_PREFIXES = LIST_PUNCT + LIST_QUOTES + [
-    r"(?:({pe})[{el}])(?=[{a}])".format(
-        a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
-    )
-]
+TOKENIZER_PREFIXES = (
+    LIST_PUNCT
+    + LIST_QUOTES
+    + [
+        r"(?:({pe})[{el}])(?=[{a}])".format(
+            a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
+        )
+    ]
+)
 
-TOKENIZER_SUFFIXES = LIST_PUNCT + LIST_QUOTES + LIST_ELLIPSES + [
-    r"(?<=[0-9])%",  # numbers like 10%
-    r"(?<=[0-9])(?:{h})".format(h=HYPHENS),  # hyphens after numbers
-    r"(?<=[{a}])['’]".format(a=ALPHA),  # apostrophes after letters
-    r"(?<=[{a}])['’][mwlnytk](?=\s|$)".format(a=ALPHA),  # contractions
-    r"(?<=[{a}0-9])\)",  # right parenthesis after letter/number
-    r"(?<=[{a}])\.(?=\s|$)".format(a=ALPHA),  # period after letter if space or end of string
-    r"(?<=\))[\.\?!]",  # punctuation immediately after right parenthesis
-]
+TOKENIZER_SUFFIXES = (
+    LIST_PUNCT
+    + LIST_QUOTES
+    + LIST_ELLIPSES
+    + [
+        r"(?<=[0-9])%",  # numbers like 10%
+        r"(?<=[0-9])(?:{h})".format(h=HYPHENS),  # hyphens after numbers
+        r"(?<=[{a}])['’]".format(a=ALPHA),  # apostrophes after letters
+        r"(?<=[{a}])['’][mwlnytk](?=\s|$)".format(a=ALPHA),  # contractions
+        r"(?<=[{a}0-9])\)",  # right parenthesis after letter/number
+        r"(?<=[{a}])\.(?=\s|$)".format(
+            a=ALPHA
+        ),  # period after letter if space or end of string
+        r"(?<=\))[\.\?!]",  # punctuation immediately after right parenthesis
+    ]
+)
 
-TOKENIZER_INFIXES = LIST_ELLIPSES + LIST_ICONS + [
-    r"(?<=[0-9])[+\-\*^](?=[0-9-])",
-    r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
-        al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
-    ),
-    r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
-    r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
-    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
-]
+TOKENIZER_INFIXES = (
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
+        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
+        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
+            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
+        ),
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
+        r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
+    ]
+)

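To see how these lists are consumed, spaCy compiles them into anchored regexes for the tokenizer. A minimal sketch, assuming the module is importable as spacy.lang.ht.punctuation; compile_prefix_regex and friends are the standard spacy.util helpers.

from spacy.lang.ht.punctuation import (
    TOKENIZER_INFIXES,
    TOKENIZER_PREFIXES,
    TOKENIZER_SUFFIXES,
)
from spacy.util import compile_infix_regex, compile_prefix_regex, compile_suffix_regex

prefix_search = compile_prefix_regex(TOKENIZER_PREFIXES).search
suffix_search = compile_suffix_regex(TOKENIZER_SUFFIXES).search
infix_finditer = compile_infix_regex(TOKENIZER_INFIXES).finditer

print(prefix_search("m'ap"))   # elision prefix m' matches before a letter
print(suffix_search("10%"))    # the %-after-digit suffix rule fires
print([m.group() for m in infix_finditer("pale-ak")])  # hyphen between letters is an infix
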
@@ -39,8 +39,7 @@ sa san si swa si
 
 men mèsi oswa osinon
 
-"""
-.split()
+""".split()
 )
 
 # Add common contractions, with and without apostrophe variants

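A quick sanity check of the stop-word list above, assuming it is importable as spacy.lang.ht.stop_words:

from spacy.lang.ht.stop_words import STOP_WORDS

print("men" in STOP_WORDS, "oswa" in STOP_WORDS)  # True True
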
@@ -1,4 +1,22 @@
-from spacy.symbols import NOUN, VERB, AUX, ADJ, ADV, PRON, DET, ADP, SCONJ, CCONJ, PART, INTJ, NUM, PROPN, PUNCT, SYM, X
+from spacy.symbols import (
+    ADJ,
+    ADP,
+    ADV,
+    AUX,
+    CCONJ,
+    DET,
+    INTJ,
+    NOUN,
+    NUM,
+    PART,
+    PRON,
+    PROPN,
+    PUNCT,
+    SCONJ,
+    SYM,
+    VERB,
+    X,
+)
 
 TAG_MAP = {
     "NOUN": {"pos": NOUN},

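Each TAG_MAP entry maps a coarse tag string to its Universal POS constant, as the first entry above shows. A minimal check, assuming the module is importable as spacy.lang.ht.tag_map:

from spacy.lang.ht.tag_map import TAG_MAP
from spacy.symbols import NOUN

assert TAG_MAP["NOUN"] == {"pos": NOUN}
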
@@ -1,4 +1,5 @@
-from spacy.symbols import ORTH, NORM
+from spacy.symbols import NORM, ORTH
 
+
 def make_variants(base, first_norm, second_orth, second_norm):
     return {

@@ -7,14 +8,16 @@ def make_variants(base, first_norm, second_orth, second_norm):
             {ORTH: second_orth, NORM: second_norm},
         ],
         base.capitalize(): [
-            {ORTH: base.split("'")[0].capitalize() + "'", NORM: first_norm.capitalize()},
+            {
+                ORTH: base.split("'")[0].capitalize() + "'",
+                NORM: first_norm.capitalize(),
+            },
             {ORTH: second_orth, NORM: second_norm},
-        ]
+        ],
     }
 
-TOKENIZER_EXCEPTIONS = {
-    "Dr.": [{ORTH: "Dr."}]
-}
+
+TOKENIZER_EXCEPTIONS = {"Dr.": [{ORTH: "Dr."}]}
 
 # Apostrophe forms
 TOKENIZER_EXCEPTIONS.update(make_variants("m'ap", "mwen", "ap", "ap"))

@@ -29,93 +32,95 @@ TOKENIZER_EXCEPTIONS.update(make_variants("p'ap", "pa", "ap", "ap"))
 TOKENIZER_EXCEPTIONS.update(make_variants("t'ap", "te", "ap", "ap"))
 
 # Non-apostrophe contractions (with capitalized variants)
-TOKENIZER_EXCEPTIONS.update({
-    "map": [
-        {ORTH: "m", NORM: "mwen"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "Map": [
-        {ORTH: "M", NORM: "Mwen"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "lem": [
-        {ORTH: "le", NORM: "le"},
-        {ORTH: "m", NORM: "mwen"},
-    ],
-    "Lem": [
-        {ORTH: "Le", NORM: "Le"},
-        {ORTH: "m", NORM: "mwen"},
-    ],
-    "lew": [
-        {ORTH: "le", NORM: "le"},
-        {ORTH: "w", NORM: "ou"},
-    ],
-    "Lew": [
-        {ORTH: "Le", NORM: "Le"},
-        {ORTH: "w", NORM: "ou"},
-    ],
-    "nap": [
-        {ORTH: "n", NORM: "nou"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "Nap": [
-        {ORTH: "N", NORM: "Nou"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "lap": [
-        {ORTH: "l", NORM: "li"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "Lap": [
-        {ORTH: "L", NORM: "Li"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "yap": [
-        {ORTH: "y", NORM: "yo"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "Yap": [
-        {ORTH: "Y", NORM: "Yo"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "mte": [
-        {ORTH: "m", NORM: "mwen"},
-        {ORTH: "te", NORM: "te"},
-    ],
-    "Mte": [
-        {ORTH: "M", NORM: "Mwen"},
-        {ORTH: "te", NORM: "te"},
-    ],
-    "mpral": [
-        {ORTH: "m", NORM: "mwen"},
-        {ORTH: "pral", NORM: "pral"},
-    ],
-    "Mpral": [
-        {ORTH: "M", NORM: "Mwen"},
-        {ORTH: "pral", NORM: "pral"},
-    ],
-    "wap": [
-        {ORTH: "w", NORM: "ou"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "Wap": [
-        {ORTH: "W", NORM: "Ou"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "kap": [
-        {ORTH: "k", NORM: "ki"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "Kap": [
-        {ORTH: "K", NORM: "Ki"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "tap": [
-        {ORTH: "t", NORM: "te"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "Tap": [
-        {ORTH: "T", NORM: "Te"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-})
+TOKENIZER_EXCEPTIONS.update(
+    {
+        "map": [
+            {ORTH: "m", NORM: "mwen"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "Map": [
+            {ORTH: "M", NORM: "Mwen"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "lem": [
+            {ORTH: "le", NORM: "le"},
+            {ORTH: "m", NORM: "mwen"},
+        ],
+        "Lem": [
+            {ORTH: "Le", NORM: "Le"},
+            {ORTH: "m", NORM: "mwen"},
+        ],
+        "lew": [
+            {ORTH: "le", NORM: "le"},
+            {ORTH: "w", NORM: "ou"},
+        ],
+        "Lew": [
+            {ORTH: "Le", NORM: "Le"},
+            {ORTH: "w", NORM: "ou"},
+        ],
+        "nap": [
+            {ORTH: "n", NORM: "nou"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "Nap": [
+            {ORTH: "N", NORM: "Nou"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "lap": [
+            {ORTH: "l", NORM: "li"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "Lap": [
+            {ORTH: "L", NORM: "Li"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "yap": [
+            {ORTH: "y", NORM: "yo"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "Yap": [
+            {ORTH: "Y", NORM: "Yo"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "mte": [
+            {ORTH: "m", NORM: "mwen"},
+            {ORTH: "te", NORM: "te"},
+        ],
+        "Mte": [
+            {ORTH: "M", NORM: "Mwen"},
+            {ORTH: "te", NORM: "te"},
+        ],
+        "mpral": [
+            {ORTH: "m", NORM: "mwen"},
+            {ORTH: "pral", NORM: "pral"},
+        ],
+        "Mpral": [
+            {ORTH: "M", NORM: "Mwen"},
+            {ORTH: "pral", NORM: "pral"},
+        ],
+        "wap": [
+            {ORTH: "w", NORM: "ou"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "Wap": [
+            {ORTH: "W", NORM: "Ou"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "kap": [
+            {ORTH: "k", NORM: "ki"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "Kap": [
+            {ORTH: "K", NORM: "Ki"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "tap": [
+            {ORTH: "t", NORM: "te"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "Tap": [
+            {ORTH: "T", NORM: "Te"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+    }
+)

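To make the helper concrete: one make_variants call above expands to a lowercase and a capitalized exception entry. An illustrative check, assuming the module is importable as spacy.lang.ht.tokenizer_exceptions; at runtime ORTH and NORM are integer symbol IDs, not names.

from spacy.lang.ht.tokenizer_exceptions import make_variants
from spacy.symbols import NORM, ORTH

variants = make_variants("m'ap", "mwen", "ap", "ap")
assert variants["M'ap"][0] == {ORTH: "M'", NORM: "Mwen"}  # capitalized first token, per the hunk
assert variants["m'ap"][1] == {ORTH: "ap", NORM: "ap"}    # shared second token
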
@@ -187,7 +187,7 @@ class Lemmatizer(Pipe):
         if univ_pos == "":
             warnings.warn(Warnings.W108)
             return [string.lower()]
-        # See Issue #435 for example of where this logic is requied.
+        # See Issue #435 for example of where this logic is required.
         if self.is_base_form(token):
             return [string.lower()]
         index_table = self.lookups.get_table("lemma_index", {})

@@ -210,7 +210,7 @@ class Lemmatizer(Pipe):
         rules = rules_table.get(univ_pos, {})
         orig = string
         string = string.lower()
-        forms = []
+        forms: List[str] = []
         oov_forms = []
         for old, new in rules:
             if string.endswith(old):

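The rules loop in the second hunk tries each (old, new) suffix rule and collects candidate forms. A toy illustration with made-up rules, not spaCy's actual lemma_rules tables:

rules = [("ies", "y"), ("s", "")]  # toy (old, new) suffix pairs
string = "studies"
forms = []
for old, new in rules:
    if string.endswith(old):
        # strip the matched suffix and append the replacement
        forms.append(string[: len(string) - len(old)] + new)
print(forms)  # ['study', 'studie']
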
@@ -29,4 +29,16 @@ def test_ht_tokenizer_handles_basic_abbreviation(ht_tokenizer, text):
 def test_ht_tokenizer_full_sentence(ht_tokenizer):
     text = "Si'm ka vini, m'ap pale ak li."
     tokens = [t.text for t in ht_tokenizer(text)]
-    assert tokens == ["Si", "'m", "ka", "vini", ",", "m'", "ap", "pale", "ak", "li", "."]
+    assert tokens == [
+        "Si",
+        "'m",
+        "ka",
+        "vini",
+        ",",
+        "m'",
+        "ap",
+        "pale",
+        "ak",
+        "li",
+        ".",
+    ]

@@ -1,4 +1,5 @@
 import pytest
+
 from spacy.tokens import Doc
 
 

@@ -37,7 +37,9 @@ def test_ht_tokenizer_splits_uneven_wrap(ht_tokenizer, text):
     assert len(tokens) == 5
 
 
-@pytest.mark.parametrize("text,length", [("Ozetazini.", 2), ("Frans.", 2), ("(Ozetazini.", 3)])
+@pytest.mark.parametrize(
+    "text,length", [("Ozetazini.", 2), ("Frans.", 2), ("(Ozetazini.", 3)]
+)
 def test_ht_tokenizer_splits_prefix_interact(ht_tokenizer, text, length):
     tokens = ht_tokenizer(text)
     assert len(tokens) == length

@@ -16,7 +16,6 @@ Nan Washington, Depatman Deta Etazini pibliye yon deklarasyon ki eksprime "regre
     assert len(tokens) == 84
 
 
-
 @pytest.mark.parametrize(
     "text,length",
     [

@@ -66,14 +65,14 @@ def test_ht_lex_attrs_capitals(word):
 
 
 @pytest.mark.parametrize(
-    "word, expected", [
+    "word, expected",
+    [
         ("'m", "mwen"),
         ("'n", "nou"),
         ("'l", "li"),
         ("'y", "yo"),
         ("'w", "ou"),
-    ]
+    ],
 )
 def test_ht_lex_attrs_norm_custom(word, expected):
     assert norm_custom(word) == expected