Mirror of https://github.com/explosion/spaCy.git (synced 2025-07-13 09:42:26 +03:00)

Format

commit 80aa445f34
parent 79f9d3ea2a
@@ -22,10 +22,12 @@ class HaitianCreoleDefaults(BaseDefaults):
     stop_words = STOP_WORDS
     tag_map = TAG_MAP
 
+
 class HaitianCreole(Language):
     lang = "ht"
     Defaults = HaitianCreoleDefaults
 
+
 @HaitianCreole.factory(
     "lemmatizer",
     assigns=["token.lemma"],
@@ -49,4 +51,5 @@ def make_lemmatizer(
         nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
     )
 
+
 __all__ = ["HaitianCreole"]
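
A minimal usage sketch for the class and factory shown above. spacy.blank() and nlp.add_pipe("lemmatizer", ...) are standard spaCy API; the "rule" mode value and the note about lookup tables are assumptions for illustration, not part of this commit.

import spacy

# Assumes a spaCy build in which the HaitianCreole ("ht") language above is registered.
nlp = spacy.blank("ht")

# The @HaitianCreole.factory("lemmatizer", ...) registration makes the component
# available by name; "rule" mode is only an illustrative choice and may require
# lemmatizer lookup tables to be installed and initialized before running it.
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule"})
print(nlp.pipe_names)  # ["lemmatizer"]
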
@@ -49,6 +49,7 @@ NORM_MAP = {
     "P": "Pa",
 }
 
+
 def like_num(text):
     text = text.strip().lower()
     if text.startswith(("+", "-", "±", "~")):
@@ -69,9 +70,11 @@ def like_num(text):
         return True
     return False
 
+
 def norm_custom(text):
     return NORM_MAP.get(text, text.lower())
 
+
 LEX_ATTRS = {
     LIKE_NUM: like_num,
     NORM: norm_custom,
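
A quick illustration of the two attribute getters reformatted here, assuming the module is importable from the usual spacy.lang.ht.lex_attrs path:

from spacy.lang.ht.lex_attrs import like_num, norm_custom

print(like_num("25"))         # True: plain digits count as number-like
print(norm_custom("P"))       # "Pa", via the NORM_MAP entry shown above
print(norm_custom("Bonjou"))  # falls back to lowercasing for words not in NORM_MAP
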
@@ -16,23 +16,37 @@ ELISION = "'’".replace(" ", "")
 _prefixes_elision = "m n l y t k w"
 _prefixes_elision += " " + _prefixes_elision.upper()
 
-TOKENIZER_PREFIXES = LIST_PUNCT + LIST_QUOTES + [
+TOKENIZER_PREFIXES = (
+    LIST_PUNCT
+    + LIST_QUOTES
+    + [
     r"(?:({pe})[{el}])(?=[{a}])".format(
         a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
     )
 ]
+)
 
-TOKENIZER_SUFFIXES = LIST_PUNCT + LIST_QUOTES + LIST_ELLIPSES + [
+TOKENIZER_SUFFIXES = (
+    LIST_PUNCT
+    + LIST_QUOTES
+    + LIST_ELLIPSES
+    + [
     r"(?<=[0-9])%",  # numbers like 10%
     r"(?<=[0-9])(?:{h})".format(h=HYPHENS),  # hyphens after numbers
     r"(?<=[{a}])['’]".format(a=ALPHA),  # apostrophes after letters
     r"(?<=[{a}])['’][mwlnytk](?=\s|$)".format(a=ALPHA),  # contractions
     r"(?<=[{a}0-9])\)",  # right parenthesis after letter/number
-    r"(?<=[{a}])\.(?=\s|$)".format(a=ALPHA),  # period after letter if space or end of string
+    r"(?<=[{a}])\.(?=\s|$)".format(
+        a=ALPHA
+    ),  # period after letter if space or end of string
     r"(?<=\))[\.\?!]",  # punctuation immediately after right parenthesis
 ]
+)
 
-TOKENIZER_INFIXES = LIST_ELLIPSES + LIST_ICONS + [
+TOKENIZER_INFIXES = (
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
     r"(?<=[0-9])[+\-\*^](?=[0-9-])",
     r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
         al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
@@ -41,3 +55,4 @@ TOKENIZER_INFIXES = LIST_ELLIPSES + LIST_ICONS + [
     r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
     r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
 ]
+)
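
A hedged sketch of how prefix/suffix/infix lists like these are consumed: spaCy compiles each list into a single regex and attaches it to the tokenizer. spacy.blank("ht") and the sample strings are illustrative assumptions, not part of this commit.

import spacy
from spacy.util import compile_suffix_regex

nlp = spacy.blank("ht")  # assumes the "ht" language is available in this spaCy build

# The same helper the Language class uses internally to build its suffix regex.
suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)
print(bool(suffix_re.search("10%")))  # the r"(?<=[0-9])%" rule above matches here

# The elision/contraction rules split apostrophe forms such as "m'ap".
print([t.text for t in nlp("m'ap manje 10%")])
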
@@ -39,8 +39,7 @@ sa san si swa si
 
 men mèsi oswa osinon
 
-"""
-.split()
+""".split()
 )
 
 # Add common contractions, with and without apostrophe variants
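
These words feed Token.is_stop once the language defaults are loaded (stop_words = STOP_WORDS in the first hunk above); a small illustrative check, assuming "ht" is registered:

import spacy

nlp = spacy.blank("ht")
print([(t.text, t.is_stop) for t in nlp("men mèsi")])  # both words appear in the list above
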
@@ -1,4 +1,22 @@
-from spacy.symbols import NOUN, VERB, AUX, ADJ, ADV, PRON, DET, ADP, SCONJ, CCONJ, PART, INTJ, NUM, PROPN, PUNCT, SYM, X
+from spacy.symbols import (
+    NOUN,
+    VERB,
+    AUX,
+    ADJ,
+    ADV,
+    PRON,
+    DET,
+    ADP,
+    SCONJ,
+    CCONJ,
+    PART,
+    INTJ,
+    NUM,
+    PROPN,
+    PUNCT,
+    SYM,
+    X,
+)
 
 TAG_MAP = {
     "NOUN": {"pos": NOUN},
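
For reference, these entries simply map coarse tag strings to spacy.symbols POS ids; a tiny check, assuming the usual spacy.lang.ht.tag_map module path:

from spacy.symbols import NOUN
from spacy.lang.ht.tag_map import TAG_MAP

print(TAG_MAP["NOUN"] == {"pos": NOUN})  # True, per the first entry shown above
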
@@ -1,5 +1,6 @@
 from spacy.symbols import ORTH, NORM
 
+
 def make_variants(base, first_norm, second_orth, second_norm):
     return {
         base: [
@@ -7,14 +8,16 @@ def make_variants(base, first_norm, second_orth, second_norm):
             {ORTH: second_orth, NORM: second_norm},
         ],
         base.capitalize(): [
-            {ORTH: base.split("'")[0].capitalize() + "'", NORM: first_norm.capitalize()},
+            {
+                ORTH: base.split("'")[0].capitalize() + "'",
+                NORM: first_norm.capitalize(),
+            },
             {ORTH: second_orth, NORM: second_norm},
-        ]
+        ],
     }
 
-TOKENIZER_EXCEPTIONS = {
-    "Dr.": [{ORTH: "Dr."}]
-}
+
+TOKENIZER_EXCEPTIONS = {"Dr.": [{ORTH: "Dr."}]}
 
 # Apostrophe forms
 TOKENIZER_EXCEPTIONS.update(make_variants("m'ap", "mwen", "ap", "ap"))
@@ -29,7 +32,8 @@ TOKENIZER_EXCEPTIONS.update(make_variants("p'ap", "pa", "ap", "ap"))
 TOKENIZER_EXCEPTIONS.update(make_variants("t'ap", "te", "ap", "ap"))
 
 # Non-apostrophe contractions (with capitalized variants)
-TOKENIZER_EXCEPTIONS.update({
+TOKENIZER_EXCEPTIONS.update(
+    {
     "map": [
         {ORTH: "m", NORM: "mwen"},
         {ORTH: "ap", NORM: "ap"},
@@ -118,4 +122,5 @@ TOKENIZER_EXCEPTIONS.update({
         {ORTH: "T", NORM: "Te"},
         {ORTH: "ap", NORM: "ap"},
     ],
-})
+    }
+)
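
To make the helper concrete: each make_variants call above expands into exception entries for both the lowercase and capitalized surface forms. A quick interactive check, assuming the usual spacy.lang.ht.tokenizer_exceptions module path:

from spacy.lang.ht.tokenizer_exceptions import TOKENIZER_EXCEPTIONS, make_variants

# "m'ap" / "M'ap" are split into "m'" / "M'" (NORM "mwen" / "Mwen") plus "ap".
print(make_variants("m'ap", "mwen", "ap", "ap"))
print(TOKENIZER_EXCEPTIONS["Dr."])  # the abbreviation stays a single token
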
@@ -29,4 +29,16 @@ def test_ht_tokenizer_handles_basic_abbreviation(ht_tokenizer, text):
 def test_ht_tokenizer_full_sentence(ht_tokenizer):
     text = "Si'm ka vini, m'ap pale ak li."
     tokens = [t.text for t in ht_tokenizer(text)]
-    assert tokens == ["Si", "'m", "ka", "vini", ",", "m'", "ap", "pale", "ak", "li", "."]
+    assert tokens == [
+        "Si",
+        "'m",
+        "ka",
+        "vini",
+        ",",
+        "m'",
+        "ap",
+        "pale",
+        "ak",
+        "li",
+        ".",
+    ]
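
The same expectation can be reproduced outside the test suite; spacy.blank("ht") is an assumption about how the fixture is built, and the expected tokens are taken verbatim from the test above.

import spacy

nlp = spacy.blank("ht")
doc = nlp("Si'm ka vini, m'ap pale ak li.")
assert [t.text for t in doc] == [
    "Si", "'m", "ka", "vini", ",", "m'", "ap", "pale", "ak", "li", ".",
]
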
@@ -37,7 +37,9 @@ def test_ht_tokenizer_splits_uneven_wrap(ht_tokenizer, text):
     assert len(tokens) == 5
 
 
-@pytest.mark.parametrize("text,length", [("Ozetazini.", 2), ("Frans.", 2), ("(Ozetazini.", 3)])
+@pytest.mark.parametrize(
+    "text,length", [("Ozetazini.", 2), ("Frans.", 2), ("(Ozetazini.", 3)]
+)
 def test_ht_tokenizer_splits_prefix_interact(ht_tokenizer, text, length):
     tokens = ht_tokenizer(text)
     assert len(tokens) == length
@@ -16,7 +16,6 @@ Nan Washington, Depatman Deta Etazini pibliye yon deklarasyon ki eksprime "regre
     assert len(tokens) == 84
 
 
-
 @pytest.mark.parametrize(
     "text,length",
     [
@@ -66,14 +65,14 @@ def test_ht_lex_attrs_capitals(word):
 
 
 @pytest.mark.parametrize(
-    "word, expected", [
+    "word, expected",
+    [
         ("'m", "mwen"),
         ("'n", "nou"),
         ("'l", "li"),
         ("'y", "yo"),
         ("'w", "ou"),
-    ]
+    ],
 )
 def test_ht_lex_attrs_norm_custom(word, expected):
     assert norm_custom(word) == expected